python-hwpx 1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hwpx/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+
2
+ """High-level utilities for working with HWPX documents."""
3
+
4
# Version string exposed as ``hwpx.__version__``.
# NOTE(review): the distribution this file ships in is labelled 1.0 —
# confirm whether this value is expected to match the release number.
__version__ = "0.1.0"

# Re-export the text extraction and object lookup helpers so callers can
# import them straight from the top-level ``hwpx`` package.
from .tools.text_extractor import (
    DEFAULT_NAMESPACES,
    ParagraphInfo,
    SectionInfo,
    TextExtractor,
)
from .tools.object_finder import FoundElement, ObjectFinder

# Names exported by ``from hwpx import *``.
__all__ = [
    "__version__",
    "DEFAULT_NAMESPACES",
    "ParagraphInfo",
    "SectionInfo",
    "TextExtractor",
    "FoundElement",
    "ObjectFinder",
]
23
+
hwpx/document.py ADDED
@@ -0,0 +1,518 @@
1
+ """High-level representation of an HWPX document."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import datetime
6
+ import uuid
7
+ import xml.etree.ElementTree as ET
8
+
9
+ from os import PathLike
10
+ from typing import BinaryIO, Iterator, List, Tuple
11
+
12
+ from .oxml import (
13
+ HwpxOxmlDocument,
14
+ HwpxOxmlHeader,
15
+ HwpxOxmlInlineObject,
16
+ HwpxOxmlMemo,
17
+ HwpxOxmlParagraph,
18
+ HwpxOxmlRun,
19
+ HwpxOxmlSection,
20
+ HwpxOxmlTable,
21
+ MemoShape,
22
+ RunStyle,
23
+ )
24
+ from .package import HwpxPackage
25
+
26
# XML namespace used for paragraph-level elements, and the same namespace in
# ElementTree's "{uri}" form so qualified tags can be built as f"{_HP}run".
_HP_NS = "http://www.hancom.co.kr/hwpml/2011/paragraph"
_HP = f"{{{_HP_NS}}}"


class HwpxDocument:
    """Provides a user-friendly API for editing HWPX documents.

    The document pairs a :class:`HwpxPackage` (the container it is saved
    through) with a :class:`HwpxOxmlDocument` (the XML object tree).  Most
    methods delegate to that tree; the memo helpers additionally build the
    MEMO field elements directly with ElementTree.
    """

    def __init__(self, package: HwpxPackage, root: HwpxOxmlDocument):
        """Wrap an already opened *package* and its parsed *root* tree."""
        self._package = package
        self._root = root

    # ------------------------------------------------------------------
    # construction helpers
    @classmethod
    def open(
        cls,
        source: str | PathLike[str] | bytes | BinaryIO,
    ) -> "HwpxDocument":
        """Open *source* and return a :class:`HwpxDocument` instance.

        *source* is handed unchanged to :meth:`HwpxPackage.open`; the
        resulting package is then parsed into the XML object tree.
        """
        package = HwpxPackage.open(source)
        root = HwpxOxmlDocument.from_package(package)
        return cls(package, root)

    @classmethod
    def from_package(cls, package: HwpxPackage) -> "HwpxDocument":
        """Create a document backed by an existing :class:`HwpxPackage`."""
        root = HwpxOxmlDocument.from_package(package)
        return cls(package, root)

    # ------------------------------------------------------------------
    # properties exposing document content
    @property
    def package(self) -> HwpxPackage:
        """Return the :class:`HwpxPackage` backing this document."""
        return self._package

    @property
    def oxml(self) -> HwpxOxmlDocument:
        """Return the low-level XML object tree representing the document."""
        return self._root

    @property
    def sections(self) -> List[HwpxOxmlSection]:
        """Return the sections contained in the document."""
        return self._root.sections

    @property
    def headers(self) -> List[HwpxOxmlHeader]:
        """Return the header parts referenced by the document."""
        return self._root.headers

    @property
    def memo_shapes(self) -> dict[str, MemoShape]:
        """Return memo shapes available in the header reference lists."""
        return self._root.memo_shapes

    def memo_shape(self, memo_shape_id_ref: int | str | None) -> MemoShape | None:
        """Return the memo shape definition referenced by *memo_shape_id_ref*."""
        return self._root.memo_shape(memo_shape_id_ref)

    @property
    def memos(self) -> List[HwpxOxmlMemo]:
        """Return all memo entries declared in every section.

        A new list is built on each access, in section order.
        """
        return [memo for section in self._root.sections for memo in section.memos]

    def add_memo(
        self,
        text: str = "",
        *,
        section: HwpxOxmlSection | None = None,
        section_index: int | None = None,
        memo_shape_id_ref: str | int | None = None,
        memo_id: str | None = None,
        char_pr_id_ref: str | int | None = None,
        attributes: dict[str, str] | None = None,
    ) -> HwpxOxmlMemo:
        """Create a memo entry inside *section* (or the last section by default).

        Args:
            text: Memo text, forwarded to the section's ``add_memo``.
            section: Target section.  Takes precedence over *section_index*.
            section_index: Index into :attr:`sections`, used only when
                *section* is not given.
            memo_shape_id_ref, memo_id, char_pr_id_ref, attributes:
                Forwarded unchanged to the section's ``add_memo``.

        Returns:
            The memo object returned by the section.

        Raises:
            ValueError: If no section was specified and the document has none.
            IndexError: If *section_index* is out of range.
        """
        if section is None and section_index is not None:
            section = self._root.sections[section_index]
        if section is None:
            if not self._root.sections:
                raise ValueError("document does not contain any sections")
            section = self._root.sections[-1]
        return section.add_memo(
            text,
            memo_shape_id_ref=memo_shape_id_ref,
            memo_id=memo_id,
            char_pr_id_ref=char_pr_id_ref,
            attributes=attributes,
        )

    def remove_memo(self, memo: HwpxOxmlMemo) -> None:
        """Remove *memo* from the section it belongs to."""
        memo.remove()

    def attach_memo_field(
        self,
        paragraph: HwpxOxmlParagraph,
        memo: HwpxOxmlMemo,
        *,
        field_id: str | None = None,
        author: str | None = None,
        created: datetime | str | None = None,
        number: int = 1,
        char_pr_id_ref: str | int | None = None,
    ) -> str:
        """Attach a MEMO field control to *paragraph* so Hangul shows *memo*.

        A run holding a ``fieldBegin`` control is inserted as the first child
        of the paragraph element and a run holding the matching ``fieldEnd``
        is appended as the last child.  The paragraph's section is then
        marked dirty.

        Args:
            paragraph: Paragraph that receives the field; must have a section.
            memo: Memo the field refers to; its group must have a section.
            field_id: Identifier for the field.  A random UUID hex string is
                generated when this is ``None`` or empty.
            author: Author name.  Falls back to the memo's ``author``
                attribute, then to an empty string.
            created: Creation time.  A ``datetime`` is formatted as
                ``YYYY-MM-DD HH:MM:SS``; other values are converted with
                ``str``.  When ``None`` the memo's ``createDateTime``
                attribute is used, and the current local time if that is
                missing too.
            number: Value of the ``Number`` parameter; values below 1 are
                written as 1.
            char_pr_id_ref: Character property reference for the generated
                runs.  When ``None`` the paragraph's reference is used, then
                the one inferred by the memo, then ``"0"``.

        Returns:
            The field identifier that was written.

        Raises:
            ValueError: If *paragraph* or *memo* is not attached to a section.
        """
        if paragraph.section is None:
            raise ValueError("paragraph must belong to a section before anchoring a memo")
        if memo.group.section is None:
            raise ValueError("memo is not attached to a section")

        field_value = field_id or uuid.uuid4().hex
        author_value = author or memo.attributes.get("author") or ""

        created_value = created if created is not None else memo.attributes.get("createDateTime")
        if isinstance(created_value, datetime):
            created_value = created_value.strftime("%Y-%m-%d %H:%M:%S")
        elif created_value is None:
            created_value = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        else:
            created_value = str(created_value)

        memo_shape_id = memo.memo_shape_id_ref or ""

        # Resolve the character reference lazily: each fallback is consulted
        # only when the previous source yielded None, so an explicit 0 wins.
        char_ref = char_pr_id_ref
        if char_ref is None:
            char_ref = paragraph.char_pr_id_ref
        if char_ref is None:
            char_ref = memo._infer_char_pr_id_ref()
        if char_ref is None:
            char_ref = "0"
        char_ref = str(char_ref)

        # Identifier shared by the sub-list, its paragraph and its text run.
        anchor_id = memo.id or field_value

        run_begin = ET.Element(f"{_HP}run", {"charPrIDRef": char_ref})
        ctrl_begin = ET.SubElement(run_begin, f"{_HP}ctrl")
        field_begin = ET.SubElement(
            ctrl_begin,
            f"{_HP}fieldBegin",
            {
                "id": field_value,
                "type": "MEMO",
                "editable": "true",
                "dirty": "false",
                "fieldid": field_value,
            },
        )

        # (element tag, parameter name, text) for each field parameter; the
        # "count" attribute is derived from this table so the two cannot drift.
        field_params = (
            ("stringParam", "ID", memo.id or ""),
            ("integerParam", "Number", str(max(1, number))),
            ("stringParam", "CreateDateTime", created_value),
            ("stringParam", "Author", author_value),
            ("stringParam", "MemoShapeID", memo_shape_id),
        )
        parameters = ET.SubElement(
            field_begin,
            f"{_HP}parameters",
            {"count": str(len(field_params)), "name": ""},
        )
        for tag, param_name, value in field_params:
            ET.SubElement(parameters, f"{_HP}{tag}", {"name": param_name}).text = value

        sub_list = ET.SubElement(
            field_begin,
            f"{_HP}subList",
            {
                "id": f"memo-field-{anchor_id}",
                "textDirection": "HORIZONTAL",
                "lineWrap": "BREAK",
                "vertAlign": "TOP",
            },
        )
        sub_para = ET.SubElement(
            sub_list,
            f"{_HP}p",
            {
                "id": f"memo-field-{anchor_id}-p",
                "paraPrIDRef": "0",
                "styleIDRef": "0",
                "pageBreak": "0",
                "columnBreak": "0",
                "merged": "0",
            },
        )
        sub_run = ET.SubElement(sub_para, f"{_HP}run", {"charPrIDRef": char_ref})
        ET.SubElement(sub_run, f"{_HP}t").text = anchor_id

        run_end = ET.Element(f"{_HP}run", {"charPrIDRef": char_ref})
        ctrl_end = ET.SubElement(run_end, f"{_HP}ctrl")
        ET.SubElement(ctrl_end, f"{_HP}fieldEnd", {"beginIDRef": field_value, "fieldid": field_value})

        # The begin/end runs bracket whatever the paragraph already contains.
        paragraph.element.insert(0, run_begin)
        paragraph.element.append(run_end)
        paragraph.section.mark_dirty()

        return field_value

    def add_memo_with_anchor(
        self,
        text: str = "",
        *,
        paragraph: HwpxOxmlParagraph | None = None,
        section: HwpxOxmlSection | None = None,
        section_index: int | None = None,
        paragraph_text: str | None = None,
        memo_shape_id_ref: str | int | None = None,
        memo_id: str | None = None,
        char_pr_id_ref: str | int | None = None,
        attributes: dict[str, str] | None = None,
        field_id: str | None = None,
        author: str | None = None,
        created: datetime | str | None = None,
        number: int = 1,
        anchor_char_pr_id_ref: str | int | None = None,
    ) -> tuple[HwpxOxmlMemo, HwpxOxmlParagraph, str]:
        """Create a memo and ensure it is visible by anchoring a MEMO field.

        The memo is created with :meth:`add_memo`.  If *paragraph* is not
        given, a new paragraph is appended to the memo's section to carry the
        anchor; otherwise the supplied paragraph is used and, when
        *paragraph_text* is not ``None``, its text is replaced.  The field is
        then attached with :meth:`attach_memo_field`.

        Args:
            text: Memo text.
            paragraph: Existing paragraph to anchor the memo to.
            section, section_index: Where to create the memo; see
                :meth:`add_memo`.
            paragraph_text: Text for the anchor paragraph.
            memo_shape_id_ref, memo_id, char_pr_id_ref, attributes:
                Forwarded to :meth:`add_memo`.
            field_id, author, created, number: Forwarded to
                :meth:`attach_memo_field`.
            anchor_char_pr_id_ref: Character property reference for the
                anchor paragraph and field runs.  Only ``None`` falls back to
                *char_pr_id_ref* for a newly created anchor paragraph, so an
                explicit ``0`` is honoured.

        Returns:
            A ``(memo, paragraph, field_id)`` tuple.

        Raises:
            ValueError: If the supplied paragraph or the created memo does
                not belong to a section, or the document has no sections.
        """
        # Reject an unusable anchor before creating the memo, so a failure
        # does not leave an orphan memo behind in the section.
        if paragraph is not None and paragraph.section is None:
            raise ValueError("paragraph must belong to a section before anchoring a memo")

        memo = self.add_memo(
            text,
            section=section,
            section_index=section_index,
            memo_shape_id_ref=memo_shape_id_ref,
            memo_id=memo_id,
            char_pr_id_ref=char_pr_id_ref,
            attributes=attributes,
        )

        target_paragraph = paragraph
        if target_paragraph is None:
            memo_section = memo.group.section
            if memo_section is None:
                raise ValueError("memo must belong to a section")
            paragraph_value = "" if paragraph_text is None else paragraph_text
            # Compare against None, not truthiness: the integer 0 is a
            # legitimate reference and must not be replaced by the fallback.
            anchor_char = (
                anchor_char_pr_id_ref
                if anchor_char_pr_id_ref is not None
                else char_pr_id_ref
            )
            target_paragraph = self.add_paragraph(
                paragraph_value,
                section=memo_section,
                char_pr_id_ref=anchor_char,
            )
        elif paragraph_text is not None:
            target_paragraph.text = paragraph_text

        field_value = self.attach_memo_field(
            target_paragraph,
            memo,
            field_id=field_id,
            author=author,
            created=created,
            number=number,
            char_pr_id_ref=anchor_char_pr_id_ref,
        )

        return memo, target_paragraph, field_value

    @property
    def paragraphs(self) -> List[HwpxOxmlParagraph]:
        """Return all paragraphs across every section."""
        return self._root.paragraphs

    @property
    def char_properties(self) -> dict[str, RunStyle]:
        """Return the resolved character style definitions available to the document."""
        return self._root.char_properties

    def char_property(self, char_pr_id_ref: int | str | None) -> RunStyle | None:
        """Return the style referenced by *char_pr_id_ref* if known."""
        return self._root.char_property(char_pr_id_ref)

    def iter_runs(self) -> Iterator[HwpxOxmlRun]:
        """Yield every run of every paragraph, in document order."""
        for paragraph in self.paragraphs:
            yield from paragraph.runs

    def find_runs_by_style(
        self,
        *,
        text_color: str | None = None,
        underline_type: str | None = None,
        underline_color: str | None = None,
        char_pr_id_ref: str | int | None = None,
    ) -> List[HwpxOxmlRun]:
        """Return runs matching the requested style criteria.

        Each criterion left as ``None`` is ignored; the rest must all match.
        *char_pr_id_ref* is compared as a whitespace-stripped string against
        the run's own reference.  A run without a resolved style never
        matches a colour or underline criterion.
        """
        matches: List[HwpxOxmlRun] = []
        target_char = str(char_pr_id_ref).strip() if char_pr_id_ref is not None else None

        for run in self.iter_runs():
            if target_char is not None and (run.char_pr_id_ref or "").strip() != target_char:
                continue
            style = run.style
            if text_color is not None and (style is None or style.text_color() != text_color):
                continue
            if underline_type is not None and (
                style is None or style.underline_type() != underline_type
            ):
                continue
            if underline_color is not None and (
                style is None or style.underline_color() != underline_color
            ):
                continue
            matches.append(run)
        return matches

    def replace_text_in_runs(
        self,
        search: str,
        replacement: str,
        *,
        text_color: str | None = None,
        underline_type: str | None = None,
        underline_color: str | None = None,
        char_pr_id_ref: str | int | None = None,
        limit: int | None = None,
    ) -> int:
        """Replace occurrences of *search* in runs matching the provided style filters.

        Args:
            search: Text to look for; must be non-empty.
            replacement: Text substituted for each occurrence.
            text_color, underline_type, underline_color, char_pr_id_ref:
                Filters passed to :meth:`find_runs_by_style`.
            limit: Maximum total number of replacements across all runs, or
                ``None`` for no limit.  A limit of zero or less replaces
                nothing.

        Returns:
            The number of replacements reported by the runs.

        Raises:
            ValueError: If *search* is empty.
        """
        if not search:
            raise ValueError("search must be a non-empty string")

        replacements = 0
        runs = self.find_runs_by_style(
            text_color=text_color,
            underline_type=underline_type,
            underline_color=underline_color,
            char_pr_id_ref=char_pr_id_ref,
        )

        for run in runs:
            remaining = None
            if limit is not None:
                # Budget left for this run; stop once it is exhausted.
                remaining = limit - replacements
                if remaining <= 0:
                    break
            replacements += run.replace_text(search, replacement, count=remaining)
        return replacements

    # ------------------------------------------------------------------
    # editing helpers
    def add_paragraph(
        self,
        text: str = "",
        *,
        section: HwpxOxmlSection | None = None,
        section_index: int | None = None,
        para_pr_id_ref: str | int | None = None,
        style_id_ref: str | int | None = None,
        char_pr_id_ref: str | int | None = None,
        run_attributes: dict[str, str] | None = None,
        include_run: bool = True,
        **extra_attrs: str,
    ) -> HwpxOxmlParagraph:
        """Append a paragraph to the document and return it.

        Formatting references may be overridden via ``para_pr_id_ref``,
        ``style_id_ref`` and ``char_pr_id_ref``.  Any additional keyword
        arguments are added as raw paragraph attributes.  All arguments are
        forwarded unchanged to the XML tree's ``add_paragraph``.
        """
        return self._root.add_paragraph(
            text,
            section=section,
            section_index=section_index,
            para_pr_id_ref=para_pr_id_ref,
            style_id_ref=style_id_ref,
            char_pr_id_ref=char_pr_id_ref,
            run_attributes=run_attributes,
            include_run=include_run,
            **extra_attrs,
        )

    def add_table(
        self,
        rows: int,
        cols: int,
        *,
        section: HwpxOxmlSection | None = None,
        section_index: int | None = None,
        width: int | None = None,
        height: int | None = None,
        border_fill_id_ref: str | int = "0",
        para_pr_id_ref: str | int | None = None,
        style_id_ref: str | int | None = None,
        char_pr_id_ref: str | int | None = None,
        run_attributes: dict[str, str] | None = None,
        **extra_attrs: str,
    ) -> HwpxOxmlTable:
        """Create a table in a new paragraph and return it.

        The host paragraph is created with empty text and no initial run
        (``include_run=False``); *extra_attrs* go to that paragraph, while the
        table-specific arguments go to the paragraph's ``add_table``.
        """
        paragraph = self.add_paragraph(
            "",
            section=section,
            section_index=section_index,
            para_pr_id_ref=para_pr_id_ref,
            style_id_ref=style_id_ref,
            char_pr_id_ref=char_pr_id_ref,
            include_run=False,
            **extra_attrs,
        )
        return paragraph.add_table(
            rows,
            cols,
            width=width,
            height=height,
            border_fill_id_ref=border_fill_id_ref,
            run_attributes=run_attributes,
            char_pr_id_ref=char_pr_id_ref,
        )

    def add_shape(
        self,
        shape_type: str,
        *,
        section: HwpxOxmlSection | None = None,
        section_index: int | None = None,
        attributes: dict[str, str] | None = None,
        para_pr_id_ref: str | int | None = None,
        style_id_ref: str | int | None = None,
        char_pr_id_ref: str | int | None = None,
        run_attributes: dict[str, str] | None = None,
        **extra_attrs: str,
    ) -> HwpxOxmlInlineObject:
        """Insert an inline shape into a new paragraph.

        The host paragraph is created with empty text and no initial run;
        *extra_attrs* go to that paragraph, the remaining shape arguments to
        the paragraph's ``add_shape``.
        """
        paragraph = self.add_paragraph(
            "",
            section=section,
            section_index=section_index,
            para_pr_id_ref=para_pr_id_ref,
            style_id_ref=style_id_ref,
            char_pr_id_ref=char_pr_id_ref,
            include_run=False,
            **extra_attrs,
        )
        return paragraph.add_shape(
            shape_type,
            attributes=attributes,
            run_attributes=run_attributes,
            char_pr_id_ref=char_pr_id_ref,
        )

    def add_control(
        self,
        *,
        section: HwpxOxmlSection | None = None,
        section_index: int | None = None,
        attributes: dict[str, str] | None = None,
        control_type: str | None = None,
        para_pr_id_ref: str | int | None = None,
        style_id_ref: str | int | None = None,
        char_pr_id_ref: str | int | None = None,
        run_attributes: dict[str, str] | None = None,
        **extra_attrs: str,
    ) -> HwpxOxmlInlineObject:
        """Insert a control inline object into a new paragraph.

        The host paragraph is created with empty text and no initial run;
        *extra_attrs* go to that paragraph, the remaining control arguments
        to the paragraph's ``add_control``.
        """
        paragraph = self.add_paragraph(
            "",
            section=section,
            section_index=section_index,
            para_pr_id_ref=para_pr_id_ref,
            style_id_ref=style_id_ref,
            char_pr_id_ref=char_pr_id_ref,
            include_run=False,
            **extra_attrs,
        )
        return paragraph.add_control(
            attributes=attributes,
            control_type=control_type,
            run_attributes=run_attributes,
            char_pr_id_ref=char_pr_id_ref,
        )

    def save(
        self,
        path_or_stream: str | PathLike[str] | BinaryIO | None = None,
    ) -> str | PathLike[str] | BinaryIO | bytes | None:
        """Persist pending changes to *path_or_stream* or the original source.

        The XML tree is serialized, the result is handed to the package's
        ``save`` together with *path_or_stream*, and the tree's dirty state is
        reset afterwards.  If saving raises, the dirty state is left intact.

        Returns:
            Whatever the package's ``save`` returns.
        """
        updates = self._root.serialize()
        result = self._package.save(path_or_stream, updates)
        self._root.reset_dirty()
        return result