python-hwpx 2.13.0__py3-none-any.whl → 2.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hwpx/__init__.py +2 -0
- hwpx/authoring.py +94 -0
- hwpx/builder/__init__.py +8 -1
- hwpx/builder/core.py +105 -1
- hwpx/builder/report.py +80 -0
- hwpx/conformance/roundtrip_batch.py +171 -0
- hwpx/document.py +21 -3
- hwpx/exam/__init__.py +22 -0
- hwpx/exam/compose.py +237 -0
- hwpx/exam/ir.py +41 -0
- hwpx/exam/measure.py +147 -0
- hwpx/exam/parser.py +145 -0
- hwpx/exam/profile.py +116 -0
- hwpx/form_fit/seal.py +451 -0
- hwpx/form_fit/wordbox.py +1212 -0
- hwpx/opc/package.py +12 -5
- hwpx/oxml/_document_impl.py +60 -6
- hwpx/oxml/body.py +45 -0
- hwpx/oxml/canonical_defaults.py +95 -0
- hwpx/oxml/header.py +16 -2
- hwpx/oxml/namespaces.py +16 -3
- hwpx/oxml/utils.py +10 -2
- hwpx/tools/id_integrity.py +4 -1
- hwpx/tools/idempotence.py +139 -0
- hwpx/tools/ir_equality.py +137 -0
- hwpx/tools/mail_merge.py +197 -4
- hwpx/tools/package_reconcile.py +72 -0
- hwpx/tools/package_validator.py +16 -6
- hwpx/tools/validator.py +6 -3
- hwpx/visual/oracle.py +72 -0
- {python_hwpx-2.13.0.dist-info → python_hwpx-2.15.0.dist-info}/METADATA +3 -1
- {python_hwpx-2.13.0.dist-info → python_hwpx-2.15.0.dist-info}/RECORD +37 -24
- {python_hwpx-2.13.0.dist-info → python_hwpx-2.15.0.dist-info}/WHEEL +0 -0
- {python_hwpx-2.13.0.dist-info → python_hwpx-2.15.0.dist-info}/entry_points.txt +0 -0
- {python_hwpx-2.13.0.dist-info → python_hwpx-2.15.0.dist-info}/licenses/LICENSE +0 -0
- {python_hwpx-2.13.0.dist-info → python_hwpx-2.15.0.dist-info}/licenses/NOTICE +0 -0
- {python_hwpx-2.13.0.dist-info → python_hwpx-2.15.0.dist-info}/top_level.txt +0 -0
hwpx/__init__.py
CHANGED
|
@@ -94,6 +94,7 @@ from .authoring import (
|
|
|
94
94
|
PlanValidationIssue,
|
|
95
95
|
PlanValidationReport,
|
|
96
96
|
create_document_from_plan,
|
|
97
|
+
get_document_plan_schema,
|
|
97
98
|
inspect_document_authoring_quality,
|
|
98
99
|
inspect_operating_plan_quality,
|
|
99
100
|
normalize_document_plan,
|
|
@@ -121,6 +122,7 @@ __all__ = [
|
|
|
121
122
|
"DEFAULT_NAMESPACES",
|
|
122
123
|
"DEFAULT_STYLE_PRESET",
|
|
123
124
|
"DOCUMENT_PLAN_SCHEMA_VERSION",
|
|
125
|
+
"get_document_plan_schema",
|
|
124
126
|
"DocumentBlock",
|
|
125
127
|
"DocumentPlan",
|
|
126
128
|
"DocumentStylePreset",
|
hwpx/authoring.py
CHANGED
|
@@ -265,6 +265,27 @@ def _plan_issue(
|
|
|
265
265
|
)
|
|
266
266
|
|
|
267
267
|
|
|
268
|
+
_PLAN_FAMILY_PREFIX = "hwpx.document_plan.v"
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def _is_forward_plan_version(version: str) -> bool:
|
|
272
|
+
"""True if *version* is a newer same-family plan schema (forward-compat).
|
|
273
|
+
|
|
274
|
+
e.g. ``hwpx.document_plan.v3`` when the latest known is v2 — validate
|
|
275
|
+
best-effort with a warning rather than hard-rejecting.
|
|
276
|
+
"""
|
|
277
|
+
if not version.startswith(_PLAN_FAMILY_PREFIX):
|
|
278
|
+
return False
|
|
279
|
+
suffix = version[len(_PLAN_FAMILY_PREFIX):]
|
|
280
|
+
if not suffix.isdigit():
|
|
281
|
+
return False
|
|
282
|
+
latest_known = max(
|
|
283
|
+
int(DOCUMENT_PLAN_SCHEMA_VERSION.rsplit("v", 1)[-1]),
|
|
284
|
+
int(DOCUMENT_PLAN_V2_SCHEMA_VERSION.rsplit("v", 1)[-1]),
|
|
285
|
+
)
|
|
286
|
+
return int(suffix) > latest_known
|
|
287
|
+
|
|
288
|
+
|
|
268
289
|
def _plan_validation_report(
|
|
269
290
|
issues: list[PlanValidationIssue],
|
|
270
291
|
*,
|
|
@@ -449,6 +470,54 @@ def _plan_repair_hints(issues: tuple[PlanValidationIssue, ...]) -> list[dict[str
|
|
|
449
470
|
return hints
|
|
450
471
|
|
|
451
472
|
|
|
473
|
+
DOCUMENT_PLAN_SCHEMA_ID = "https://airmang.github.io/hwpx-plugins/schemas/document_plan.schema.json"
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def get_document_plan_schema() -> dict[str, Any]:
|
|
477
|
+
"""Return a JSON Schema (draft 2020-12) for the declarative document plan.
|
|
478
|
+
|
|
479
|
+
Built live from the validator's own constants so it never drifts from the
|
|
480
|
+
accepted contract. Usable directly as an LLM Structured-Outputs / external
|
|
481
|
+
JSON-Schema-validation contract: it constrains the envelope (schemaVersion,
|
|
482
|
+
a non-empty ``blocks`` array, each block carrying a known ``type``) while
|
|
483
|
+
leaving block bodies open (``additionalProperties``) for forward-compat.
|
|
484
|
+
"""
|
|
485
|
+
|
|
486
|
+
return {
|
|
487
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
488
|
+
"$id": DOCUMENT_PLAN_SCHEMA_ID,
|
|
489
|
+
"title": "HWPX Document Plan",
|
|
490
|
+
"type": "object",
|
|
491
|
+
"required": ["schemaVersion", "blocks"],
|
|
492
|
+
"additionalProperties": True,
|
|
493
|
+
"properties": {
|
|
494
|
+
"schemaVersion": {
|
|
495
|
+
"type": "string",
|
|
496
|
+
"enum": [DOCUMENT_PLAN_SCHEMA_VERSION, DOCUMENT_PLAN_V2_SCHEMA_VERSION],
|
|
497
|
+
"description": "Plan schema version. Newer same-family versions validate best-effort.",
|
|
498
|
+
},
|
|
499
|
+
"title": {"type": "string"},
|
|
500
|
+
"metadata": {"type": "object"},
|
|
501
|
+
"blocks": {
|
|
502
|
+
"type": "array",
|
|
503
|
+
"minItems": 1,
|
|
504
|
+
"items": {
|
|
505
|
+
"type": "object",
|
|
506
|
+
"required": ["type"],
|
|
507
|
+
"additionalProperties": True,
|
|
508
|
+
"properties": {
|
|
509
|
+
"type": {
|
|
510
|
+
"type": "string",
|
|
511
|
+
"enum": sorted(_SUPPORTED_BLOCK_TYPES),
|
|
512
|
+
"description": "Block kind. Body fields depend on the type.",
|
|
513
|
+
}
|
|
514
|
+
},
|
|
515
|
+
},
|
|
516
|
+
},
|
|
517
|
+
},
|
|
518
|
+
}
|
|
519
|
+
|
|
520
|
+
|
|
452
521
|
def validate_document_plan(plan: Mapping[str, Any]) -> PlanValidationReport:
|
|
453
522
|
"""Return validation errors for a ``hwpx.document_plan.v1`` mapping."""
|
|
454
523
|
|
|
@@ -472,6 +541,31 @@ def validate_document_plan(plan: Mapping[str, Any]) -> PlanValidationReport:
|
|
|
472
541
|
|
|
473
542
|
schema_version = str(plan.get("schemaVersion") or "").strip()
|
|
474
543
|
if schema_version not in {DOCUMENT_PLAN_SCHEMA_VERSION, DOCUMENT_PLAN_V2_SCHEMA_VERSION}:
|
|
544
|
+
if _is_forward_plan_version(schema_version):
|
|
545
|
+
# Forward-compat: a newer same-family version warns and validates as
|
|
546
|
+
# the latest known schema (best-effort) instead of hard-rejecting, so
|
|
547
|
+
# a plan emitted against a newer schema still generates. Unknown newer
|
|
548
|
+
# fields are simply ignored by the v2 validator.
|
|
549
|
+
issues.append(
|
|
550
|
+
_plan_issue(
|
|
551
|
+
"forward_schema_version",
|
|
552
|
+
"schemaVersion",
|
|
553
|
+
(
|
|
554
|
+
f"schemaVersion {schema_version!r} is newer than the latest "
|
|
555
|
+
f"known {DOCUMENT_PLAN_V2_SCHEMA_VERSION!r}; validating as "
|
|
556
|
+
"latest known (best-effort)."
|
|
557
|
+
),
|
|
558
|
+
severity="warning",
|
|
559
|
+
suggestion="Unknown newer fields are ignored; verify the output.",
|
|
560
|
+
)
|
|
561
|
+
)
|
|
562
|
+
v2_report = _validate_document_plan_v2(
|
|
563
|
+
plan, schema_version=DOCUMENT_PLAN_V2_SCHEMA_VERSION
|
|
564
|
+
)
|
|
565
|
+
return _plan_validation_report(
|
|
566
|
+
[*issues, *v2_report.issues],
|
|
567
|
+
schema_version=schema_version,
|
|
568
|
+
)
|
|
475
569
|
issues.append(
|
|
476
570
|
_plan_issue(
|
|
477
571
|
"invalid_schema_version",
|
hwpx/builder/__init__.py
CHANGED
|
@@ -20,10 +20,17 @@ from .core import (
|
|
|
20
20
|
Table,
|
|
21
21
|
approval_box,
|
|
22
22
|
)
|
|
23
|
-
from .report import
|
|
23
|
+
from .report import (
|
|
24
|
+
FIDELITY_CONTRACT,
|
|
25
|
+
BuilderSaveReport,
|
|
26
|
+
BuilderVerifyReport,
|
|
27
|
+
ReopenReport,
|
|
28
|
+
)
|
|
24
29
|
|
|
25
30
|
__all__ = [
|
|
31
|
+
"FIDELITY_CONTRACT",
|
|
26
32
|
"BuilderSaveReport",
|
|
33
|
+
"BuilderVerifyReport",
|
|
27
34
|
"Bullet",
|
|
28
35
|
"Document",
|
|
29
36
|
"Footer",
|
hwpx/builder/core.py
CHANGED
|
@@ -7,11 +7,14 @@ from pathlib import Path
|
|
|
7
7
|
from typing import Any, Mapping, Sequence
|
|
8
8
|
|
|
9
9
|
from hwpx.document import HwpxDocument
|
|
10
|
+
from hwpx.tools.id_integrity import check_id_integrity
|
|
11
|
+
from hwpx.tools.idempotence import IdempotenceReport, check_idempotent_pair
|
|
12
|
+
from hwpx.tools.package_reconcile import reconcile_package_with_document
|
|
10
13
|
from hwpx.tools.package_validator import validate_editor_open_safety
|
|
11
14
|
from hwpx.tools.package_validator import validate_package
|
|
12
15
|
from hwpx.tools.validator import validate_document
|
|
13
16
|
|
|
14
|
-
from .report import BuilderSaveReport, ReopenReport
|
|
17
|
+
from .report import BuilderSaveReport, BuilderVerifyReport, ReopenReport
|
|
15
18
|
|
|
16
19
|
|
|
17
20
|
BuilderChild = (
|
|
@@ -805,3 +808,104 @@ class Document:
|
|
|
805
808
|
visual_complete=visual_complete,
|
|
806
809
|
)
|
|
807
810
|
return report
|
|
811
|
+
|
|
812
|
+
def verify(self) -> BuilderVerifyReport:
|
|
813
|
+
"""Dry, no-disk pre-write verification of the built document.
|
|
814
|
+
|
|
815
|
+
Lowers the document to bytes in memory and runs the save hard gates
|
|
816
|
+
(package, document, editor-open-safety, reopen) *plus* id-integrity and
|
|
817
|
+
a two-round idempotence check — a strictly stronger gate set than
|
|
818
|
+
:meth:`save_to_path` (whose report leaves id-integrity to the reader and
|
|
819
|
+
does not check idempotence) — without writing any file. Returns a
|
|
820
|
+
compact signal so a caller can branch on ``ok`` and read a
|
|
821
|
+
section/paragraph count before paying to materialize a real save.
|
|
822
|
+
|
|
823
|
+
Serialization itself can fail (e.g. open-safety rejects the output); in
|
|
824
|
+
that case this returns ``ok=False`` with ``serialize_error`` set rather
|
|
825
|
+
than raising, so a caller (fuzz loop, agent) can always branch on the
|
|
826
|
+
result.
|
|
827
|
+
|
|
828
|
+
See :data:`hwpx.builder.report.FIDELITY_CONTRACT` for what a green
|
|
829
|
+
verdict proves vs. does not prove.
|
|
830
|
+
"""
|
|
831
|
+
|
|
832
|
+
try:
|
|
833
|
+
lowered = self.lower()
|
|
834
|
+
data = lowered.to_bytes()
|
|
835
|
+
except Exception as exc: # the document cannot even be serialized
|
|
836
|
+
return BuilderVerifyReport(
|
|
837
|
+
ok=False,
|
|
838
|
+
reopen_ok=False,
|
|
839
|
+
package_ok=False,
|
|
840
|
+
document_ok=False,
|
|
841
|
+
editor_open_safety_ok=False,
|
|
842
|
+
id_integrity_ok=False,
|
|
843
|
+
idempotent=False,
|
|
844
|
+
sections_reconciled=False,
|
|
845
|
+
serialize_error=f"{type(exc).__name__}: {exc}",
|
|
846
|
+
)
|
|
847
|
+
|
|
848
|
+
package_report = validate_package(data)
|
|
849
|
+
document_report = validate_document(data)
|
|
850
|
+
editor_open_safety_report = validate_editor_open_safety(data)
|
|
851
|
+
|
|
852
|
+
reopened: HwpxDocument | None = None
|
|
853
|
+
reopen_error: str | None = None
|
|
854
|
+
try:
|
|
855
|
+
reopened = HwpxDocument.open(data)
|
|
856
|
+
except Exception as exc: # surfaced in the report rather than raised
|
|
857
|
+
reopen_error = f"{type(exc).__name__}: {exc}"
|
|
858
|
+
|
|
859
|
+
id_integrity = (
|
|
860
|
+
check_id_integrity(reopened) if reopened is not None else None
|
|
861
|
+
)
|
|
862
|
+
|
|
863
|
+
# Fixed-point check on the EXACT bytes the gates above validated (gen-1)
|
|
864
|
+
# vs. their reopen-and-resave (gen-2), so the idempotence verdict refers
|
|
865
|
+
# to the bytes we would actually write, not a later generation.
|
|
866
|
+
idempotence: IdempotenceReport | None = None
|
|
867
|
+
serialize_error: str | None = None
|
|
868
|
+
try:
|
|
869
|
+
idempotence = check_idempotent_pair(data, HwpxDocument.open(data).to_bytes())
|
|
870
|
+
except Exception as exc:
|
|
871
|
+
serialize_error = f"{type(exc).__name__}: {exc}"
|
|
872
|
+
|
|
873
|
+
# Output-vs-intent: produced section parts must match the source model.
|
|
874
|
+
reconcile = reconcile_package_with_document(data, lowered)
|
|
875
|
+
|
|
876
|
+
package_ok = bool(getattr(package_report, "ok", False))
|
|
877
|
+
document_ok = bool(getattr(document_report, "ok", False))
|
|
878
|
+
editor_open_safety_ok = bool(getattr(editor_open_safety_report, "ok", False))
|
|
879
|
+
id_integrity_ok = bool(getattr(id_integrity, "ok", False))
|
|
880
|
+
idempotent = bool(idempotence is not None and idempotence.ok)
|
|
881
|
+
reopen_ok = reopened is not None
|
|
882
|
+
section_count = len(reopened.sections) if reopened is not None else 0
|
|
883
|
+
paragraph_count = len(reopened.paragraphs) if reopened is not None else 0
|
|
884
|
+
|
|
885
|
+
ok = (
|
|
886
|
+
package_ok
|
|
887
|
+
and document_ok
|
|
888
|
+
and editor_open_safety_ok
|
|
889
|
+
and id_integrity_ok
|
|
890
|
+
and reopen_ok
|
|
891
|
+
and idempotent
|
|
892
|
+
and reconcile.ok
|
|
893
|
+
)
|
|
894
|
+
|
|
895
|
+
return BuilderVerifyReport(
|
|
896
|
+
ok=ok,
|
|
897
|
+
reopen_ok=reopen_ok,
|
|
898
|
+
package_ok=package_ok,
|
|
899
|
+
document_ok=document_ok,
|
|
900
|
+
editor_open_safety_ok=editor_open_safety_ok,
|
|
901
|
+
id_integrity_ok=id_integrity_ok,
|
|
902
|
+
idempotent=idempotent,
|
|
903
|
+
sections_reconciled=reconcile.ok,
|
|
904
|
+
section_count=section_count,
|
|
905
|
+
paragraph_count=paragraph_count,
|
|
906
|
+
byte_length=len(data),
|
|
907
|
+
reopen_error=reopen_error,
|
|
908
|
+
serialize_error=serialize_error,
|
|
909
|
+
idempotence=idempotence,
|
|
910
|
+
reconcile=reconcile,
|
|
911
|
+
)
|
hwpx/builder/report.py
CHANGED
|
@@ -7,10 +7,34 @@ from typing import Any
|
|
|
7
7
|
|
|
8
8
|
from hwpx.quality import VisualCompleteReport
|
|
9
9
|
from hwpx.tools.id_integrity import IdIntegrityReport, check_id_integrity
|
|
10
|
+
from hwpx.tools.idempotence import IdempotenceReport
|
|
11
|
+
from hwpx.tools.package_reconcile import PackageReconcileReport
|
|
10
12
|
from hwpx.tools.package_validator import EditorOpenSafetyReport, PackageValidationReport
|
|
11
13
|
from hwpx.tools.validator import ValidationReport
|
|
12
14
|
|
|
13
15
|
|
|
16
|
+
# Explicit scope of what the builder's automated gates prove vs. don't, so a
|
|
17
|
+
# green ``hard_gates`` is never mistaken for full Hancom/visual fidelity. The
|
|
18
|
+
# gates answer "will Hancom likely open this", NOT "did every authored element
|
|
19
|
+
# round-trip". Surfaced in every report's ``to_dict()``.
|
|
20
|
+
FIDELITY_CONTRACT: dict[str, list[str]] = {
|
|
21
|
+
"proves": [
|
|
22
|
+
"package opens as a valid HWPX (mimetype/OPC structure, required entries)",
|
|
23
|
+
"no dangling id references or orphan BinData (id_integrity)",
|
|
24
|
+
"no known editor-open breakage patterns (editor_open_safety)",
|
|
25
|
+
"re-saving reproduces identical part contents (idempotent serialization)",
|
|
26
|
+
"the document reopens with our reader (reopen)",
|
|
27
|
+
],
|
|
28
|
+
"does_not_prove": [
|
|
29
|
+
"visual layout fidelity in Hancom (line/page breaks, overlap) — needs the "
|
|
30
|
+
"visual oracle / ComputerUse",
|
|
31
|
+
"every authored element round-tripped byte-for-byte: merges, shapes, BinData "
|
|
32
|
+
"bytes, and equation script are not value-diffed",
|
|
33
|
+
"macOS Hancom acceptance for untested element combinations",
|
|
34
|
+
],
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
|
|
14
38
|
@dataclass(frozen=True)
|
|
15
39
|
class ReopenReport:
|
|
16
40
|
"""Result of reopening a generated document."""
|
|
@@ -56,6 +80,10 @@ class BuilderSaveReport:
|
|
|
56
80
|
"path": str(self.path),
|
|
57
81
|
"metadata": dict(self.metadata or {}),
|
|
58
82
|
"hard_gates": dict(self.hard_gates),
|
|
83
|
+
"fidelity_contract": {
|
|
84
|
+
"proves": list(FIDELITY_CONTRACT["proves"]),
|
|
85
|
+
"does_not_prove": list(FIDELITY_CONTRACT["does_not_prove"]),
|
|
86
|
+
},
|
|
59
87
|
"visual_review_required": self.visual_review_required,
|
|
60
88
|
"feature_flags": dict(self.feature_flags),
|
|
61
89
|
"visual_complete": (
|
|
@@ -113,3 +141,55 @@ class BuilderSaveReport:
|
|
|
113
141
|
}
|
|
114
142
|
),
|
|
115
143
|
}
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
@dataclass(frozen=True)
|
|
147
|
+
class BuilderVerifyReport:
|
|
148
|
+
"""Compact, no-disk pre-write verification signal from ``Document.verify()``.
|
|
149
|
+
|
|
150
|
+
Lowers the built document to bytes in memory and runs the same hard gates as
|
|
151
|
+
a real save plus a two-round idempotence check — without writing a file — so
|
|
152
|
+
a caller (agent, fuzz loop) can branch on ``ok`` before committing a path.
|
|
153
|
+
See :data:`FIDELITY_CONTRACT` for what these gates prove vs. don't.
|
|
154
|
+
"""
|
|
155
|
+
|
|
156
|
+
ok: bool
|
|
157
|
+
reopen_ok: bool
|
|
158
|
+
package_ok: bool
|
|
159
|
+
document_ok: bool
|
|
160
|
+
editor_open_safety_ok: bool
|
|
161
|
+
id_integrity_ok: bool
|
|
162
|
+
idempotent: bool
|
|
163
|
+
sections_reconciled: bool = True
|
|
164
|
+
section_count: int = 0
|
|
165
|
+
paragraph_count: int = 0
|
|
166
|
+
byte_length: int = 0
|
|
167
|
+
reopen_error: str | None = None
|
|
168
|
+
serialize_error: str | None = None
|
|
169
|
+
idempotence: IdempotenceReport | None = None
|
|
170
|
+
reconcile: PackageReconcileReport | None = None
|
|
171
|
+
|
|
172
|
+
def to_dict(self) -> dict[str, Any]:
|
|
173
|
+
return {
|
|
174
|
+
"ok": self.ok,
|
|
175
|
+
"reopen_ok": self.reopen_ok,
|
|
176
|
+
"package_ok": self.package_ok,
|
|
177
|
+
"document_ok": self.document_ok,
|
|
178
|
+
"editor_open_safety_ok": self.editor_open_safety_ok,
|
|
179
|
+
"id_integrity_ok": self.id_integrity_ok,
|
|
180
|
+
"idempotent": self.idempotent,
|
|
181
|
+
"sections_reconciled": self.sections_reconciled,
|
|
182
|
+
"section_count": self.section_count,
|
|
183
|
+
"paragraph_count": self.paragraph_count,
|
|
184
|
+
"byte_length": self.byte_length,
|
|
185
|
+
"reopen_error": self.reopen_error,
|
|
186
|
+
"serialize_error": self.serialize_error,
|
|
187
|
+
"idempotence": (
|
|
188
|
+
None if self.idempotence is None else self.idempotence.to_dict()
|
|
189
|
+
),
|
|
190
|
+
"reconcile": (None if self.reconcile is None else self.reconcile.to_dict()),
|
|
191
|
+
"fidelity_contract": {
|
|
192
|
+
"proves": list(FIDELITY_CONTRACT["proves"]),
|
|
193
|
+
"does_not_prove": list(FIDELITY_CONTRACT["does_not_prove"]),
|
|
194
|
+
},
|
|
195
|
+
}
|
hwpx/conformance/roundtrip_batch.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
"""Ranked roundtrip batch harness over an HWPX corpus.
|
|
3
|
+
|
|
4
|
+
Runs each sample through ``open -> serialize (round1) -> reopen -> serialize
|
|
5
|
+
(round2)`` and classifies it into a single ranked status:
|
|
6
|
+
|
|
7
|
+
PARSE_FAIL -> SERIALIZE_FAIL -> REPARSE_FAIL -> ROUND2_DIFF -> PASS
|
|
8
|
+
|
|
9
|
+
Hard failures (parse/serialize/reparse) are structural and fail CI (non-zero
|
|
10
|
+
exit). ``ROUND2_DIFF`` (the serializer is not a fixed point from its own output)
|
|
11
|
+
is *gradable* — reported but non-blocking — so a corpus can carry a known-
|
|
12
|
+
imperfect sample without masking a true structural regression. A separate
|
|
13
|
+
informational ``source_semantic_drift`` flag records whether the content
|
|
14
|
+
sequence changed from the ORIGINAL on first roundtrip (legitimate normalization
|
|
15
|
+
or a genuine loss; not a pass/fail by itself).
|
|
16
|
+
|
|
17
|
+
This gives a cheap, continuous, Hancom-free content-fidelity signal between the
|
|
18
|
+
expensive ComputerUse visual checks.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import json
|
|
24
|
+
import sys
|
|
25
|
+
from collections import Counter
|
|
26
|
+
from dataclasses import dataclass, field
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
from hwpx.document import HwpxDocument
|
|
30
|
+
from hwpx.tools.idempotence import check_idempotent_pair
|
|
31
|
+
from hwpx.tools.ir_equality import compare_documents_semantic
|
|
32
|
+
|
|
33
|
+
__all__ = ["SampleResult", "BatchReport", "classify_sample", "run_corpus", "main"]
|
|
34
|
+
|
|
35
|
+
_STATUS_RANK = {
|
|
36
|
+
"PARSE_FAIL": 0,
|
|
37
|
+
"SERIALIZE_FAIL": 1,
|
|
38
|
+
"REPARSE_FAIL": 2,
|
|
39
|
+
"ROUND2_DIFF": 3,
|
|
40
|
+
"PASS": 4,
|
|
41
|
+
}
|
|
42
|
+
_HARD_FAILS = {"PARSE_FAIL", "SERIALIZE_FAIL", "REPARSE_FAIL"}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass(frozen=True)
|
|
46
|
+
class SampleResult:
|
|
47
|
+
sample: str
|
|
48
|
+
status: str
|
|
49
|
+
detail: str = ""
|
|
50
|
+
source_semantic_drift: bool = False
|
|
51
|
+
|
|
52
|
+
@property
|
|
53
|
+
def is_hard_fail(self) -> bool:
|
|
54
|
+
return self.status in _HARD_FAILS
|
|
55
|
+
|
|
56
|
+
def to_dict(self) -> dict[str, object]:
|
|
57
|
+
return {
|
|
58
|
+
"sample": self.sample,
|
|
59
|
+
"status": self.status,
|
|
60
|
+
"detail": self.detail,
|
|
61
|
+
"source_semantic_drift": self.source_semantic_drift,
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass
|
|
66
|
+
class BatchReport:
|
|
67
|
+
results: list[SampleResult] = field(default_factory=list)
|
|
68
|
+
|
|
69
|
+
@property
|
|
70
|
+
def counts(self) -> dict[str, int]:
|
|
71
|
+
return dict(Counter(r.status for r in self.results))
|
|
72
|
+
|
|
73
|
+
@property
|
|
74
|
+
def hard_fail_count(self) -> int:
|
|
75
|
+
return sum(1 for r in self.results if r.is_hard_fail)
|
|
76
|
+
|
|
77
|
+
@property
|
|
78
|
+
def drift_count(self) -> int:
|
|
79
|
+
return sum(1 for r in self.results if r.source_semantic_drift)
|
|
80
|
+
|
|
81
|
+
@property
|
|
82
|
+
def ok(self) -> bool:
|
|
83
|
+
return self.hard_fail_count == 0
|
|
84
|
+
|
|
85
|
+
def to_tsv(self) -> str:
|
|
86
|
+
lines = ["sample\tstatus\tsource_drift\tdetail"]
|
|
87
|
+
for r in self.results:
|
|
88
|
+
lines.append(f"{r.sample}\t{r.status}\t{int(r.source_semantic_drift)}\t{r.detail}")
|
|
89
|
+
return "\n".join(lines) + "\n"
|
|
90
|
+
|
|
91
|
+
def to_dict(self) -> dict[str, object]:
|
|
92
|
+
return {
|
|
93
|
+
"ok": self.ok,
|
|
94
|
+
"total": len(self.results),
|
|
95
|
+
"counts": self.counts,
|
|
96
|
+
"hardFailCount": self.hard_fail_count,
|
|
97
|
+
"sourceDriftCount": self.drift_count,
|
|
98
|
+
"results": [r.to_dict() for r in self.results],
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def classify_sample(path: str | Path) -> SampleResult:
|
|
103
|
+
name = Path(path).name
|
|
104
|
+
try:
|
|
105
|
+
source = Path(path).read_bytes()
|
|
106
|
+
doc1 = HwpxDocument.open(source)
|
|
107
|
+
except Exception as exc:
|
|
108
|
+
return SampleResult(name, "PARSE_FAIL", f"{type(exc).__name__}: {exc}")
|
|
109
|
+
try:
|
|
110
|
+
round1 = doc1.to_bytes()
|
|
111
|
+
except Exception as exc:
|
|
112
|
+
return SampleResult(name, "SERIALIZE_FAIL", f"{type(exc).__name__}: {exc}")
|
|
113
|
+
try:
|
|
114
|
+
round2 = HwpxDocument.open(round1).to_bytes()
|
|
115
|
+
except Exception as exc:
|
|
116
|
+
return SampleResult(name, "REPARSE_FAIL", f"{type(exc).__name__}: {exc}")
|
|
117
|
+
|
|
118
|
+
# Informational: did the content sequence change from the original?
|
|
119
|
+
drift = not compare_documents_semantic(source, round1).ok
|
|
120
|
+
|
|
121
|
+
idem = check_idempotent_pair(round1, round2)
|
|
122
|
+
if not idem.ok:
|
|
123
|
+
return SampleResult(name, "ROUND2_DIFF", idem.summary(), drift)
|
|
124
|
+
return SampleResult(name, "PASS", "", drift)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def run_corpus(
|
|
128
|
+
corpus_dir: str | Path, samples: list[str] | None = None
|
|
129
|
+
) -> BatchReport:
|
|
130
|
+
corpus = Path(corpus_dir)
|
|
131
|
+
if samples is None:
|
|
132
|
+
manifest = corpus / "manifest.json"
|
|
133
|
+
if manifest.exists():
|
|
134
|
+
samples = [s["file"] for s in json.loads(manifest.read_text("utf-8"))["samples"]]
|
|
135
|
+
else:
|
|
136
|
+
samples = sorted(p.name for p in corpus.glob("*.hwpx"))
|
|
137
|
+
report = BatchReport()
|
|
138
|
+
for sample in samples:
|
|
139
|
+
report.results.append(classify_sample(corpus / sample))
|
|
140
|
+
# Worst-first ordering for readability.
|
|
141
|
+
report.results.sort(key=lambda r: (_STATUS_RANK.get(r.status, 99), r.sample))
|
|
142
|
+
return report
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def main(argv: list[str] | None = None) -> int:
|
|
146
|
+
argv = list(sys.argv[1:] if argv is None else argv)
|
|
147
|
+
if not argv:
|
|
148
|
+
print("usage: roundtrip_batch <corpus_dir> [--tsv out.tsv] [--json out.json]", file=sys.stderr)
|
|
149
|
+
return 2
|
|
150
|
+
corpus_dir = argv[0]
|
|
151
|
+
tsv_out = json_out = None
|
|
152
|
+
if "--tsv" in argv:
|
|
153
|
+
tsv_out = argv[argv.index("--tsv") + 1]
|
|
154
|
+
if "--json" in argv:
|
|
155
|
+
json_out = argv[argv.index("--json") + 1]
|
|
156
|
+
|
|
157
|
+
report = run_corpus(corpus_dir)
|
|
158
|
+
if tsv_out:
|
|
159
|
+
Path(tsv_out).write_text(report.to_tsv(), "utf-8")
|
|
160
|
+
if json_out:
|
|
161
|
+
Path(json_out).write_text(json.dumps(report.to_dict(), ensure_ascii=False, indent=2), "utf-8")
|
|
162
|
+
|
|
163
|
+
print(
|
|
164
|
+
f"corpus={len(report.results)} counts={report.counts} "
|
|
165
|
+
f"hardFails={report.hard_fail_count} sourceDrift={report.drift_count}"
|
|
166
|
+
)
|
|
167
|
+
return 1 if report.hard_fail_count else 0
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
if __name__ == "__main__": # pragma: no cover
|
|
171
|
+
raise SystemExit(main())
|
hwpx/document.py
CHANGED
|
@@ -1819,6 +1819,9 @@ class HwpxDocument:
|
|
|
1819
1819
|
spacing_before_pt: float | None = None,
|
|
1820
1820
|
spacing_after_pt: float | None = None,
|
|
1821
1821
|
outline_level: int | None = None,
|
|
1822
|
+
keep_with_next: bool | None = None,
|
|
1823
|
+
keep_lines: bool | None = None,
|
|
1824
|
+
page_break_before: bool | None = None,
|
|
1822
1825
|
bottom_border: bool = False,
|
|
1823
1826
|
border_color: str = "#BFBFBF",
|
|
1824
1827
|
border_width: str = "0.12 mm",
|
|
@@ -1826,7 +1829,9 @@ class HwpxDocument:
|
|
|
1826
1829
|
"""Apply paragraph-level formatting using human units.
|
|
1827
1830
|
|
|
1828
1831
|
Millimetre inputs are converted to HWP units; paragraph spacing uses
|
|
1829
|
-
points; line spacing is stored as a percent value.
|
|
1832
|
+
points; line spacing is stored as a percent value. ``keep_with_next`` /
|
|
1833
|
+
``keep_lines`` / ``page_break_before`` set the paragraph's keep-together
|
|
1834
|
+
(``<hh:breakSetting>``) flags via a freshly minted paraPr.
|
|
1830
1835
|
"""
|
|
1831
1836
|
|
|
1832
1837
|
if not self._root.headers:
|
|
@@ -1858,12 +1863,21 @@ class HwpxDocument:
|
|
|
1858
1863
|
else:
|
|
1859
1864
|
raise ValueError("outline_level must be between 0 and 10")
|
|
1860
1865
|
|
|
1866
|
+
break_setting: dict[str, bool] = {}
|
|
1867
|
+
if keep_with_next is not None:
|
|
1868
|
+
break_setting["keep_with_next"] = bool(keep_with_next)
|
|
1869
|
+
if keep_lines is not None:
|
|
1870
|
+
break_setting["keep_lines"] = bool(keep_lines)
|
|
1871
|
+
if page_break_before is not None:
|
|
1872
|
+
break_setting["page_break_before"] = bool(page_break_before)
|
|
1873
|
+
|
|
1861
1874
|
if (
|
|
1862
1875
|
alignment is None
|
|
1863
1876
|
and line_spacing_percent is None
|
|
1864
1877
|
and not margins
|
|
1865
1878
|
and heading is None
|
|
1866
1879
|
and not bottom_border
|
|
1880
|
+
and not break_setting
|
|
1867
1881
|
):
|
|
1868
1882
|
raise ValueError("at least one paragraph formatting option is required")
|
|
1869
1883
|
|
|
@@ -1897,6 +1911,7 @@ class HwpxDocument:
|
|
|
1897
1911
|
margins=margins,
|
|
1898
1912
|
heading=heading,
|
|
1899
1913
|
border=border,
|
|
1914
|
+
break_setting=break_setting or None,
|
|
1900
1915
|
)
|
|
1901
1916
|
paragraph.para_pr_id_ref = para_pr_id
|
|
1902
1917
|
formatted.append({"paragraph_index": index, "paraPrIDRef": para_pr_id})
|
|
@@ -2446,8 +2461,11 @@ class HwpxDocument:
|
|
|
2446
2461
|
# 1) Write image bytes into the package
|
|
2447
2462
|
self._package.write(bin_data_path, image_data)
|
|
2448
2463
|
|
|
2449
|
-
# 2) Register in manifest
|
|
2450
|
-
|
|
2464
|
+
# 2) Register in manifest. ``isEmbeded="1"`` (OWPML's single-d spelling) marks
|
|
2465
|
+
# the BinData image as embedded — real Hancom drops the picture without it.
|
|
2466
|
+
self._package.add_manifest_item(
|
|
2467
|
+
item_id, bin_data_path, media_type, extra_attrs={"isEmbeded": "1"}
|
|
2468
|
+
)
|
|
2451
2469
|
|
|
2452
2470
|
# 3) Register in header binDataList
|
|
2453
2471
|
header = self._root.headers[0] if self._root.headers else None
|
hwpx/exam/__init__.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Exam re-typesetting (조판): authored exam Markdown -> school form .hwpx body."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from .compose import ComposePlan, ComposeResult, ParaSpec, compose_exam_into_form, lower_exam, replace_body_region
|
|
5
|
+
from .ir import ExamDoc, Placeholder, Question, QuestionSet
|
|
6
|
+
from .measure import (
|
|
7
|
+
SplitReport,
|
|
8
|
+
column_x_bounds,
|
|
9
|
+
group_question_blocks,
|
|
10
|
+
measure_question_splits,
|
|
11
|
+
)
|
|
12
|
+
from .parser import ExamParseError, parse_exam_markdown
|
|
13
|
+
from .profile import FormProfile, FormProfileError, ResolvedStyle, profile_form
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"ExamDoc", "Placeholder", "Question", "QuestionSet",
|
|
17
|
+
"ExamParseError", "parse_exam_markdown",
|
|
18
|
+
"FormProfile", "FormProfileError", "ResolvedStyle", "profile_form",
|
|
19
|
+
"column_x_bounds", "group_question_blocks", "measure_question_splits", "SplitReport",
|
|
20
|
+
"ParaSpec", "lower_exam", "replace_body_region", "ComposePlan",
|
|
21
|
+
"ComposeResult", "compose_exam_into_form",
|
|
22
|
+
]
|