libefiling 0.1.58__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. {libefiling-0.1.58 → libefiling-0.2.0}/.gitignore +1 -0
  2. {libefiling-0.1.58 → libefiling-0.2.0}/PKG-INFO +23 -9
  3. {libefiling-0.1.58 → libefiling-0.2.0}/README.md +22 -7
  4. {libefiling-0.1.58 → libefiling-0.2.0}/docs/manifest.md +9 -12
  5. {libefiling-0.1.58 → libefiling-0.2.0}/pyproject.toml +20 -2
  6. {libefiling-0.1.58 → libefiling-0.2.0}/src/libefiling/__init__.py +1 -2
  7. {libefiling-0.1.58 → libefiling-0.2.0}/src/libefiling/archive/utils.py +6 -6
  8. {libefiling-0.1.58 → libefiling-0.2.0}/src/libefiling/default_config.py +4 -4
  9. {libefiling-0.1.58 → libefiling-0.2.0}/src/libefiling/manifest.py +55 -23
  10. {libefiling-0.1.58 → libefiling-0.2.0}/src/libefiling/parse.py +25 -25
  11. {libefiling-0.1.58 → libefiling-0.2.0}/src/libefiling/xml/kind.py +6 -0
  12. libefiling-0.1.58/docs/benchmark_process_images.py +0 -43
  13. libefiling-0.1.58/docs/benchmark_resize.py +0 -137
  14. libefiling-0.1.58/docs/benchmark_resize_isolated.py +0 -146
  15. libefiling-0.1.58/src/libefiling/xml/utils.py +0 -47
  16. {libefiling-0.1.58 → libefiling-0.2.0}/LICENSE +0 -0
  17. {libefiling-0.1.58 → libefiling-0.2.0}/docs/README.md +0 -0
  18. {libefiling-0.1.58 → libefiling-0.2.0}/docs/archive_structure_notes.md +0 -0
  19. {libefiling-0.1.58 → libefiling-0.2.0}/docs/file-1.png +0 -0
  20. {libefiling-0.1.58 → libefiling-0.2.0}/docs/file-2.png +0 -0
  21. {libefiling-0.1.58 → libefiling-0.2.0}/docs/file-3.png +0 -0
  22. {libefiling-0.1.58 → libefiling-0.2.0}/src/libefiling/archive/__init__.py +0 -0
  23. {libefiling-0.1.58 → libefiling-0.2.0}/src/libefiling/archive/aaa.py +0 -0
  24. {libefiling-0.1.58 → libefiling-0.2.0}/src/libefiling/archive/extract.py +0 -0
  25. {libefiling-0.1.58 → libefiling-0.2.0}/src/libefiling/archive/handler.py +0 -0
  26. {libefiling-0.1.58 → libefiling-0.2.0}/src/libefiling/archive/nnf.py +0 -0
  27. {libefiling-0.1.58 → libefiling-0.2.0}/src/libefiling/charset.py +0 -0
  28. {libefiling-0.1.58 → libefiling-0.2.0}/src/libefiling/cli.py +0 -0
  29. {libefiling-0.1.58 → libefiling-0.2.0}/src/libefiling/image/__init__.py +0 -0
  30. {libefiling-0.1.58 → libefiling-0.2.0}/src/libefiling/image/convert.py +0 -0
  31. {libefiling-0.1.58 → libefiling-0.2.0}/src/libefiling/image/kind.py +0 -0
  32. {libefiling-0.1.58 → libefiling-0.2.0}/src/libefiling/image/mediatype.py +0 -0
  33. {libefiling-0.1.58 → libefiling-0.2.0}/src/libefiling/image/ocr.py +0 -0
  34. {libefiling-0.1.58 → libefiling-0.2.0}/src/libefiling/image/params.py +0 -0
@@ -178,3 +178,4 @@ old
178
178
  output2
179
179
  ./images
180
180
  ./images
181
+ .envrc
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: libefiling
3
- Version: 0.1.58
3
+ Version: 0.2.0
4
4
  Summary: A Python library for e-filing systems.
5
5
  Project-URL: Homepage, https://github.com/hyperion13th144m/libefiling
6
6
  Project-URL: Repository, https://github.com/hyperion13th144m/libefiling
@@ -13,7 +13,6 @@ Classifier: Operating System :: POSIX :: Linux
13
13
  Classifier: Programming Language :: Python :: 3
14
14
  Requires-Python: >=3.12
15
15
  Requires-Dist: asn1crypto<2.0.0,>=1.5.1
16
- Requires-Dist: dotenv<0.10.0,>=0.9.9
17
16
  Requires-Dist: pillow>=12.1.1
18
17
  Requires-Dist: pydantic<3.0.0,>=2.12.5
19
18
  Requires-Dist: pytesseract<0.4.0,>=0.3.13
@@ -54,7 +53,7 @@ pip install libefiling
54
53
 
55
54
  ## 使い方
56
55
  ```python
57
- from libefiling import parse_archive, ImageConvertParam, generate_sha256, get_document_code, get_doc_id
56
+ from libefiling import parse_archive, ImageConvertParam, Source
58
57
 
59
58
  params = [
60
59
  ImageConvertParam(
@@ -89,8 +88,13 @@ OUT='output'
89
88
  ### "chemical-formulas", "figures", "equations", "tables", "other-images", "ALL"
90
89
  ### ]
91
90
  ocr_target = ["other-images"]
92
- doc_id = generate_sha256(SRC)
93
- if doc_id === '...':
91
+
92
+ # src のハッシュ値や文書コードを生成して、処理するか判定する例
93
+ source = Source.create(SRC)
94
+ document_code = source.get_document_code()
95
+ if document_code not in ['A163', 'A151']:
96
+ raise ValueError(f"Unsupported document code: {document_code}")
97
+ if source.sha256 == '...':
94
98
  print("Already processed")
95
99
  else:
96
100
  parse_archive(
@@ -102,8 +106,7 @@ else:
102
106
  image_max_workers=0, # 0: CPU数に応じて自動
103
107
  )
104
108
 
105
- print(get_document_code("output/manifest.json"))
106
- print(get_doc_id("output/manifest.json"))
109
+
107
110
  ```
108
111
  - generate_sha256 はアーカイブの内容に応じたハッシュ値を生成し、再処理判定用に使える。
109
112
  - parse_archive は SRC,PROCを OUTに展開する。第4引数に、画像変換のパラメータを渡せる。
@@ -112,8 +115,7 @@ OUT に各種ファイルが展開される。第5引数はOCR処理対象の画
112
115
  - image_max_workers が 1 のとき: シリアル実行
113
116
  - image_max_workers が 2 以上のとき: スレッド並列実行
114
117
  - image_max_workers が 0 のとき: CPU数ベースで自動設定
115
- - get_document_code parse_archive で生成された manifest.json のパスを与えると、文書コード(e.g. A163)を返す。
116
- - get_doc_id は parse_archive で生成された manifest.json のパスを与えると、doc_id を返す。
118
+ - Source.create(SRC) が返す source は、manifest.json の sources や xml/sources.xml に記録される内容と同じ。つまり、parse_archive を実行する前に source.sha256 を取得できる。
117
119
 
118
120
  ### 画像変換の高速化オプション
119
121
  既定では Pillow でリサイズします。環境変数 LIBEFILING_RESIZER_BACKEND を指定すると、
@@ -193,3 +195,15 @@ MIT ライセンス
193
195
 
194
196
  0.1.56
195
197
  - 画像リサイズのために pillow-simd を選択できるようにした。
198
+
199
+ 0.1.60
200
+ - get_document_code 関数は、manifest.jsonだけでなく、アーカイブパス・手続ファイルを与えても文書コードを返すようにした。
201
+ - manifest.json に 文書コードを含めた
202
+
203
+ 0.2.0
204
+ - manifest.json の documents フィールドを sources フィールドに変更した。
205
+ - sources の子要素は配列でなく archive, procedure とした。
206
+ - sources.document_code フィールドは、文書コードを表す
207
+ - get_document_code 廃止,Source クラスの get_document_code で代替
208
+ - get_doc_id, generate_sha256 関数廃止, Source クラスの sha256 で代替
209
+ - xml/sources.xml をはき出すようにした. manifest.json の sources フィールドと同じ内容を表す。
@@ -33,7 +33,7 @@ pip install libefiling
33
33
 
34
34
  ## 使い方
35
35
  ```python
36
- from libefiling import parse_archive, ImageConvertParam, generate_sha256, get_document_code, get_doc_id
36
+ from libefiling import parse_archive, ImageConvertParam, Source
37
37
 
38
38
  params = [
39
39
  ImageConvertParam(
@@ -68,8 +68,13 @@ OUT='output'
68
68
  ### "chemical-formulas", "figures", "equations", "tables", "other-images", "ALL"
69
69
  ### ]
70
70
  ocr_target = ["other-images"]
71
- doc_id = generate_sha256(SRC)
72
- if doc_id === '...':
71
+
72
+ # src のハッシュ値や文書コードを生成して、処理するか判定する例
73
+ source = Source.create(SRC)
74
+ document_code = source.get_document_code()
75
+ if document_code not in ['A163', 'A151']:
76
+ raise ValueError(f"Unsupported document code: {document_code}")
77
+ if source.sha256 == '...':
73
78
  print("Already processed")
74
79
  else:
75
80
  parse_archive(
@@ -81,8 +86,7 @@ else:
81
86
  image_max_workers=0, # 0: CPU数に応じて自動
82
87
  )
83
88
 
84
- print(get_document_code("output/manifest.json"))
85
- print(get_doc_id("output/manifest.json"))
89
+
86
90
  ```
87
91
  - generate_sha256 はアーカイブの内容に応じたハッシュ値を生成し、再処理判定用に使える。
88
92
  - parse_archive は SRC,PROCを OUTに展開する。第4引数に、画像変換のパラメータを渡せる。
@@ -91,8 +95,7 @@ OUT に各種ファイルが展開される。第5引数はOCR処理対象の画
91
95
  - image_max_workers が 1 のとき: シリアル実行
92
96
  - image_max_workers が 2 以上のとき: スレッド並列実行
93
97
  - image_max_workers が 0 のとき: CPU数ベースで自動設定
94
- - get_document_code parse_archive で生成された manifest.json のパスを与えると、文書コード(e.g. A163)を返す。
95
- - get_doc_id は parse_archive で生成された manifest.json のパスを与えると、doc_id を返す。
98
+ - Source.create(SRC) が返す source は、manifest.json の sources や xml/sources.xml に記録される内容と同じ。つまり、parse_archive を実行する前に source.sha256 を取得できる。
96
99
 
97
100
  ### 画像変換の高速化オプション
98
101
  既定では Pillow でリサイズします。環境変数 LIBEFILING_RESIZER_BACKEND を指定すると、
@@ -172,3 +175,15 @@ MIT ライセンス
172
175
 
173
176
  0.1.56
174
177
  - 画像リサイズのために pillow-simd を選択できるようにした。
178
+
179
+ 0.1.60
180
+ - get_document_code 関数は、manifest.jsonだけでなく、アーカイブパス・手続ファイルを与えても文書コードを返すようにした。
181
+ - manifest.json に 文書コードを含めた
182
+
183
+ 0.2.0
184
+ - manifest.json の documents フィールドを sources フィールドに変更した。
185
+ - sources の子要素は配列でなく archive, procedure とした。
186
+ - sources.document_code フィールドは、文書コードを表す
187
+ - get_document_code 廃止,Source クラスの get_document_code で代替
188
+ - get_doc_id, generate_sha256 関数廃止, Source クラスの sha256 で代替
189
+ - xml/sources.xml をはき出すようにした. manifest.json の sources フィールドと同じ内容を表す。
@@ -40,7 +40,7 @@ manifest.json は、次の設計方針に基づいている。
40
40
  {
41
41
  "manifest_version": "1.0.0",
42
42
  "generator": { ... },
43
- "document": { ... },
43
+ "sources": { ... },
44
44
  "paths": { ... },
45
45
  "xml_files": [ ... ],
46
46
  "images": [ ... ],
@@ -71,12 +71,11 @@ manifest.json は、次の設計方針に基づいている。
71
71
  - 再現性やデバッグのために使用される
72
72
 
73
73
 
74
- ## 4.3 document
74
+ ## 4.3 sources
75
75
  ```json
76
- "document": {
77
- "doc_id": "D000001",
78
- "sources": [
79
- {
76
+ "sources": {
77
+ "document_code": "A163",
78
+ "archive": {
80
79
  "filename": "...AAA.JWX",
81
80
  "sha256": "...",
82
81
  "byte_size": 12345678,
@@ -84,7 +83,7 @@ manifest.json は、次の設計方針に基づいている。
84
83
  "kind": "AA",
85
84
  "extension": ".JWX"
86
85
  },
87
- {
86
+ "procedure": {
88
87
  "filename": "...AFM.XML",
89
88
  "sha256": "...",
90
89
  "byte_size": 4220,
@@ -92,13 +91,12 @@ manifest.json は、次の設計方針に基づいている。
92
91
  "kind": "FM",
93
92
  "extension": ".XML"
94
93
  }
95
- ]
96
94
  }
97
95
  ```
98
96
 
99
- - doc_id は、この文書単位を一意に識別するためのID
100
- - source は基になったファイルに関する情報
101
- - archive_sha256 は再処理判定や追跡用
97
+ - document_code は、文書の分類コード
98
+ - archive, procedure は基になったファイルに関する情報
99
+ - sha256 はarchive, procedure のファイル内容に基づいて生成されたハッシュ値。処理済みかどうかの判定に使える
102
100
  - task, kind, extension はファイル名から得られるアーカイブの属する業務、種類、拡張子
103
101
  - task の値は以下の通り
104
102
  - A: 出願
@@ -116,7 +114,6 @@ manifest.json は、次の設計方針に基づいている。
116
114
  - ER: 緊急避難用送信ファイル
117
115
  - FM: 手続情報管理ファイル
118
116
  - XX: 不明(上記に当てはまらない場合)
119
- - procedure_source は手続き情報ファイルに関する情報
120
117
 
121
118
  ### 4.4 paths
122
119
  ```json
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "libefiling"
3
- version = "0.1.58"
3
+ version = "0.2.0"
4
4
  description = "A Python library for e-filing systems."
5
5
  authors = [{ name = "hyperion13th144m", email = "hyperion13th144m@gmail.com" }]
6
6
  requires-python = ">=3.12"
@@ -16,7 +16,6 @@ dependencies = [
16
16
  "asn1crypto (>=1.5.1,<2.0.0)",
17
17
  "pytesseract (>=0.3.13,<0.4.0)",
18
18
  "pydantic (>=2.12.5,<3.0.0)",
19
- "dotenv (>=0.9.9,<0.10.0)",
20
19
  "pillow>=12.1.1",
21
20
  ]
22
21
 
@@ -42,3 +41,22 @@ include = ["src/libefiling"]
42
41
  [build-system]
43
42
  requires = ["hatchling"]
44
43
  build-backend = "hatchling.build"
44
+
45
+ [tool.ruff.lint]
46
+ # 1. Enable flake8-bugbear (`B`) rules, in addition to the defaults.
47
+ select = ["E4", "E7", "E9", "F", "B"]
48
+
49
+ # 2. Avoid enforcing line-length violations (`E501`)
50
+ ignore = ["E501"]
51
+
52
+ # 3. Avoid trying to fix flake8-bugbear (`B`) violations.
53
+ unfixable = ["B"]
54
+
55
+ # 4. Ignore `E402` (import violations) in all `__init__.py` files, and in selected subdirectories.
56
+ [tool.ruff.lint.per-file-ignores]
57
+ "**/{tests,docs,tools}/*" = ["E402"]
58
+ "__init__.py" = ["E402"]
59
+
60
+ [tool.ruff.format]
61
+ # 5. Use double quotes in `ruff format`.
62
+ quote-style = "double"
@@ -1,5 +1,4 @@
1
1
  from .archive.utils import generate_sha256
2
2
  from .image.params import ImageConvertParam
3
- from .manifest import Manifest, get_doc_id
3
+ from .manifest import Manifest, Source
4
4
  from .parse import parse_archive
5
- from .xml.utils import get_document_code
@@ -2,19 +2,19 @@ import hashlib
2
2
  from pathlib import Path
3
3
 
4
4
 
5
- def generate_sha256(archive_path: str | Path) -> str:
6
- """return document sha256 based on archive_path content
5
+ def generate_sha256(file_path: str | Path) -> str:
6
+ """return document sha256 based on file_path content
7
7
 
8
8
  Args:
9
- archive_path (str | Path): archive path
9
+ file_path (str | Path): file path
10
10
 
11
11
  Returns:
12
12
  str: document sha256
13
13
  """
14
14
  sha256_hash = hashlib.sha256()
15
- if isinstance(archive_path, Path):
16
- archive_path = str(archive_path)
17
- with open(archive_path, "rb") as f:
15
+ if isinstance(file_path, Path):
16
+ file_path = str(file_path)
17
+ with open(file_path, "rb") as f:
18
18
  # Read and update hash string value in blocks of 4K
19
19
  for byte_block in iter(lambda: f.read(4096), b""):
20
20
  sha256_hash.update(byte_block)
@@ -1,4 +1,4 @@
1
- from .image.params import ImageConvertParam
1
+ from .image.params import ImageAttribute, ImageConvertParam
2
2
 
3
3
  defaultImageParams = [
4
4
  ImageConvertParam(
@@ -6,20 +6,20 @@ defaultImageParams = [
6
6
  height=300,
7
7
  suffix="-thumbnail",
8
8
  format=".webp",
9
- attributes=[{"key": "sizeTag", "value": "thumbnail"}],
9
+ attributes=[ImageAttribute(key="sizeTag", value="thumbnail")],
10
10
  ),
11
11
  ImageConvertParam(
12
12
  width=600,
13
13
  height=600,
14
14
  suffix="-middle",
15
15
  format=".webp",
16
- attributes=[{"key": "sizeTag", "value": "middle"}],
16
+ attributes=[ImageAttribute(key="sizeTag", value="middle")],
17
17
  ),
18
18
  ImageConvertParam(
19
19
  width=800,
20
20
  height=0,
21
21
  suffix="-large",
22
22
  format=".webp",
23
- attributes=[{"key": "sizeTag", "value": "large"}],
23
+ attributes=[ImageAttribute(key="sizeTag", value="large")],
24
24
  ),
25
25
  ]
@@ -1,12 +1,13 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from datetime import datetime
4
- from enum import Enum
5
4
  from pathlib import Path
6
- from typing import List, Literal, Optional, get_args
5
+ from typing import List, Optional
6
+ from xml.etree import ElementTree as ET
7
7
 
8
8
  from pydantic import BaseModel, Field
9
9
 
10
+ from libefiling.archive.utils import generate_sha256
10
11
  from libefiling.image.kind import IMAGE_KIND
11
12
  from libefiling.xml.kind import XML_KIND
12
13
 
@@ -30,13 +31,14 @@ class Source(BaseModel):
30
31
  extension: str
31
32
 
32
33
  @classmethod
33
- def create(cls, file_path: str, sha256: str) -> Source:
34
+ def create(cls, file_path: str | Path) -> Source:
34
35
  """Create Source from file path
35
36
 
36
37
  Args:
37
- file_path (str): file path
38
+ file_path (str | Path): file path
38
39
  """
39
40
  filename = Path(file_path).name
41
+ sha256 = generate_sha256(file_path)
40
42
  byte_size = Path(file_path).stat().st_size
41
43
  if len(filename) == 63:
42
44
  task = filename[56 : 56 + 1]
@@ -54,10 +56,55 @@ class Source(BaseModel):
54
56
  extension=extension,
55
57
  )
56
58
 
59
+ def get_document_code(self) -> str:
60
+ """Get document code from archive file name
61
+
62
+ Args:
63
+ Returns:
64
+ str: document code (e.g. A163), or "UNKNOWN" if the filename is shorter than 29 characters
65
+ """
66
+ if len(self.filename) < 29:
67
+ return "UNKNOWN"
68
+ else:
69
+ return self.filename[19 : 19 + 9].replace("_", "").strip()
57
70
 
58
- class DocumentInfo(BaseModel):
59
- doc_id: str
60
- sources: List[Source]
71
+
72
+ class Sources(BaseModel):
73
+ document_code: str
74
+ archive: Source
75
+ procedure: Source
76
+
77
+ def save_as_xml(self, xml_path: str) -> None:
78
+ """Save Sources as XML file
79
+
80
+ Args:
81
+ xml_path (str): XML file path to save
82
+ """
83
+ root = ET.Element("sources", attrib={"document-code": self.document_code})
84
+ for source in [self.archive, self.procedure]:
85
+ ET.SubElement(
86
+ root,
87
+ "source",
88
+ attrib={
89
+ "filename": source.filename,
90
+ "sha256": source.sha256,
91
+ "byte-size": str(source.byte_size),
92
+ "task": source.task,
93
+ "kind": source.kind,
94
+ "extension": source.extension,
95
+ },
96
+ )
97
+ tree = ET.ElementTree(root)
98
+ tree.write(xml_path, encoding="utf-8", xml_declaration=True)
99
+
100
+ def to_xml_file(self, xml_path: str) -> XmlFile:
101
+ return XmlFile(
102
+ filename=Path(xml_path).name,
103
+ original_filename=None,
104
+ sha256=generate_sha256(xml_path),
105
+ encoding=EncodingInfo(detected="UTF-8", normalized_to="UTF-8"),
106
+ kind="source",
107
+ )
61
108
 
62
109
 
63
110
  # -------------------------
@@ -151,23 +198,8 @@ class Stats(BaseModel):
151
198
  class Manifest(BaseModel):
152
199
  manifest_version: str = "1.0.0"
153
200
  generator: GeneratorInfo
154
- document: DocumentInfo
201
+ sources: Sources
155
202
  paths: Paths = Paths()
156
203
  xml_files: List[XmlFile] = []
157
204
  images: List[ImageEntry] = []
158
205
  stats: Stats
159
- images: List[ImageEntry] = []
160
- stats: Stats
161
-
162
-
163
- def get_doc_id(manifest_path: str) -> str | None:
164
- """Get document ID from manifest file
165
-
166
- Args:
167
- manifest_path (str): manifest file path (e.g. manifest.json)
168
- Returns:
169
- str: document ID (e.g. 2024000000000)
170
- """
171
- mp = Path(manifest_path)
172
- manifest = Manifest.model_validate_json(mp.read_text(encoding="utf-8"))
173
- return manifest.document.doc_id.strip() if manifest.document.doc_id else None
@@ -1,18 +1,16 @@
1
1
  import os
2
- import shutil
3
2
  from concurrent.futures import ThreadPoolExecutor
4
3
  from datetime import datetime
5
4
  from importlib.metadata import version as get_version
6
5
  from itertools import chain
7
6
  from pathlib import Path
8
- from typing import Iterable, Iterator, List, Literal, Union, get_args
7
+ from typing import Iterable, Iterator, List
9
8
 
10
9
  from libefiling.archive.utils import generate_sha256
11
10
  from libefiling.image.kind import OCR_TARGET, detect_image_kind
12
11
  from libefiling.image.mediatype import get_media_type
13
12
  from libefiling.manifest import (
14
13
  DerivedImage,
15
- DocumentInfo,
16
14
  EncodingInfo,
17
15
  GeneratorInfo,
18
16
  ImageAttributes,
@@ -20,6 +18,7 @@ from libefiling.manifest import (
20
18
  Manifest,
21
19
  OcrInfo,
22
20
  Source,
21
+ Sources,
23
22
  Stats,
24
23
  XmlFile,
25
24
  )
@@ -69,7 +68,8 @@ def parse_archive(
69
68
  xml_files = process_xml(raw_xml_files, xml_dir)
70
69
 
71
70
  ### convert charset of procedure xml to UTF-8 and save to xml_dir
72
- xml_files.append(process_procedure_xml(Path(src_procedure_path), xml_dir))
71
+ proc_xml_path = xml_dir / "procedure.xml"
72
+ xml_files.append(process_procedure_xml(Path(src_procedure_path), proc_xml_path))
73
73
 
74
74
  ### guess language
75
75
  lang = guess_language_by_filename(str(xml_dir))
@@ -89,10 +89,21 @@ def parse_archive(
89
89
  max_workers=image_max_workers,
90
90
  )
91
91
 
92
+ ### generate sources.xml
93
+ source_archive = Source.create(src_archive_path)
94
+ source_proc = Source.create(src_procedure_path)
95
+ sources = Sources(
96
+ document_code=source_archive.get_document_code(),
97
+ archive=source_archive,
98
+ procedure=source_proc,
99
+ )
100
+ sources_xml_path = str(xml_dir / "sources.xml")
101
+ sources.save_as_xml(sources_xml_path)
102
+ xml_files.append(sources.to_xml_file(sources_xml_path))
103
+
92
104
  # generate manifest
93
105
  manifest = process_manifest(
94
- src_archive_path,
95
- src_procedure_path,
106
+ sources,
96
107
  str(xml_dir),
97
108
  xml_files,
98
109
  images,
@@ -148,16 +159,14 @@ def process_xml(
148
159
 
149
160
  def process_procedure_xml(
150
161
  src_procedure_path: Path,
151
- xml_dir: Path,
152
- filename: str = "procedure.xml",
162
+ xml_path: Path,
153
163
  ) -> XmlFile:
154
- xml_path = xml_dir / filename
155
164
  convert_xml_charset(str(src_procedure_path), str(xml_path))
156
165
  return XmlFile(
157
- filename=filename,
166
+ filename=xml_path.name,
158
167
  encoding=EncodingInfo(detected="shift_jis", normalized_to="UTF-8"),
159
168
  sha256=generate_sha256(xml_path),
160
- kind=detect_xml_kind(filename),
169
+ kind=detect_xml_kind(xml_path.name),
161
170
  )
162
171
 
163
172
 
@@ -177,7 +186,9 @@ def process_images(
177
186
  workers = _resolve_worker_count(max_workers)
178
187
  if workers <= 1 or len(image_list) == 1:
179
188
  return [
180
- _process_single_image(image, images_dir, ocr_dir, image_params, lang, ocr_target)
189
+ _process_single_image(
190
+ image, images_dir, ocr_dir, image_params, lang, ocr_target
191
+ )
181
192
  for image in image_list
182
193
  ]
183
194
 
@@ -275,8 +286,7 @@ def get_ocr_text(image: Path, ocr_dir: Path, lang: str) -> OcrInfo:
275
286
 
276
287
 
277
288
  def process_manifest(
278
- src_archive_path: str,
279
- src_procedure_path: str,
289
+ sources: Sources,
280
290
  xml_dir: str,
281
291
  xml_files: list[XmlFile],
282
292
  images: list[ImageEntry],
@@ -287,17 +297,7 @@ def process_manifest(
287
297
  version=get_version("libefiling"),
288
298
  created_at=datetime.now(),
289
299
  ),
290
- document=DocumentInfo(
291
- doc_id=generate_sha256(src_archive_path),
292
- sources=[
293
- Source.create(
294
- src_archive_path, sha256=generate_sha256(src_archive_path)
295
- ),
296
- Source.create(
297
- src_procedure_path, sha256=generate_sha256(src_procedure_path)
298
- ),
299
- ],
300
- ),
300
+ sources=sources,
301
301
  xml_files=xml_files,
302
302
  images=images,
303
303
  stats=Stats(
@@ -23,6 +23,7 @@ XML_KIND = Literal[
23
23
  "special-attached-documents",
24
24
  "special-st26-sequence-list",
25
25
  "procedure",
26
+ "source",
26
27
  "unknown",
27
28
  ]
28
29
 
@@ -139,6 +140,11 @@ re_xml: list[XML_RE_MAP] = [
139
140
  "regex": re.compile(r"procedure\.xml"),
140
141
  "description": "procedure XML procedure.xml",
141
142
  },
143
+ {
144
+ "kind": "source",
145
+ "regex": re.compile(r"source\.xml"),
146
+ "description": "source XML source.xml",
147
+ },
142
148
  ]
143
149
 
144
150
 
@@ -1,43 +0,0 @@
1
- from pathlib import Path
2
- from time import perf_counter
3
-
4
- from libefiling.default_config import defaultImageParams
5
- from libefiling.parse import process_images
6
-
7
- # Benchmark target: images/ 以下の tif 画像
8
- image_files = sorted(Path("images").rglob("*.tif"))
9
- if not image_files:
10
- raise SystemExit("No *.tif files found under images/")
11
-
12
- sample_size = min(120, len(image_files))
13
- sample = image_files[:sample_size]
14
- print(f"sample: {sample_size} files")
15
-
16
-
17
- def run_case(max_workers: int | None, out_root: Path) -> float:
18
- out_images = out_root / "images"
19
- out_ocr = out_root / "ocr"
20
- out_images.mkdir(parents=True, exist_ok=True)
21
- out_ocr.mkdir(parents=True, exist_ok=True)
22
-
23
- start = perf_counter()
24
- result = process_images(
25
- sample,
26
- out_images,
27
- out_ocr,
28
- defaultImageParams,
29
- "jpn",
30
- None,
31
- max_workers=max_workers,
32
- )
33
- elapsed = perf_counter() - start
34
- print(
35
- f"max_workers={max_workers} elapsed={elapsed:.3f}s items={len(result)}"
36
- )
37
- return elapsed
38
-
39
-
40
- serial_sec = run_case(1, Path("/tmp/libefiling-bench-serial"))
41
- parallel_sec = run_case(0, Path("/tmp/libefiling-bench-auto"))
42
- print(f"speedup: {serial_sec / parallel_sec:.3f}x")
43
- print(f"time_reduction: {(serial_sec - parallel_sec) / serial_sec * 100:.2f}%")
@@ -1,137 +0,0 @@
1
- """Benchmark the current PIL implementation in the active environment."""
2
-
3
- import argparse
4
- import importlib.metadata
5
- import json
6
- import statistics
7
- from pathlib import Path
8
- from time import perf_counter
9
-
10
- import libefiling.image.convert as conv
11
- from libefiling.default_config import defaultImageParams
12
- from libefiling.image.convert import get_size, load_image, resize_image
13
-
14
-
15
- def parse_args() -> argparse.Namespace:
16
- parser = argparse.ArgumentParser()
17
- parser.add_argument("--sample-size", type=int, default=120)
18
- parser.add_argument("--repeats", type=int, default=3)
19
- parser.add_argument("--backend", default="pillow")
20
- parser.add_argument("--json", action="store_true")
21
- return parser.parse_args()
22
-
23
-
24
- args = parse_args()
25
-
26
- # --------------------------------------------------------------------------- #
27
- # 画像ファイルの収集
28
- # --------------------------------------------------------------------------- #
29
- DATA_ROOT = Path("images/var/data")
30
- image_files = sorted(DATA_ROOT.rglob("*.tif"))
31
- if not image_files:
32
- raise SystemExit(f"No *.tif files found under {DATA_ROOT}")
33
-
34
- MAX_SAMPLES = 120
35
- sample_files = image_files[: min(args.sample_size, len(image_files))]
36
- print(f"Total *.tif: {len(image_files)} → using {len(sample_files)} files")
37
-
38
- # --------------------------------------------------------------------------- #
39
- # pillow-simd の有無を確認
40
- # --------------------------------------------------------------------------- #
41
- try:
42
- simd_ver = importlib.metadata.version("pillow-simd")
43
- print(f"pillow-simd : {simd_ver}")
44
- except importlib.metadata.PackageNotFoundError:
45
- simd_ver = None
46
- print("pillow-simd : not installed (pillow-simd backend は pillow にフォールバックします)")
47
-
48
- try:
49
- pillow_ver = importlib.metadata.version("Pillow")
50
- except importlib.metadata.PackageNotFoundError:
51
- pillow_ver = "unknown"
52
- print(f"Pillow : {pillow_ver}")
53
-
54
- if simd_ver is not None:
55
- print("NOTE: pillow-simd をインストールした環境では PIL 自体が pillow-simd 実装です。")
56
- print(" このスクリプト内の 'pillow' と 'pillow-simd' は別バイナリ比較ではなく、")
57
- print(" 同じ PIL 実装に対する別コードパス比較になります。")
58
- else:
59
- print("NOTE: pillow-simd 未導入環境では 'pillow-simd' backend は Pillow にフォールバックします。")
60
- print(" このスクリプト内の 'pillow' と 'pillow-simd' は別バイナリ比較にはなりません。")
61
-
62
- print(" 真の比較を行うには、Pillow 環境と pillow-simd 環境を分けて個別に実行してください。")
63
- print()
64
-
65
- # --------------------------------------------------------------------------- #
66
- # 画像を事前にメモリへロード(I/O をベンチから除外)
67
- # --------------------------------------------------------------------------- #
68
- print("Loading images into memory...", end=" ", flush=True)
69
- loaded_images = []
70
- for p in sample_files:
71
- try:
72
- loaded_images.append(load_image(p))
73
- except Exception as e:
74
- print(f"\nSkipping {p}: {e}")
75
- print(f"{len(loaded_images)} images loaded.")
76
- print()
77
-
78
- # リサイズターゲット(defaultImageParams の全サイズを使用)
79
- resize_targets = [(p.width, p.height) for p in defaultImageParams]
80
-
81
- # --------------------------------------------------------------------------- #
82
- # ベンチマーク本体
83
- # --------------------------------------------------------------------------- #
84
- REPEATS = args.repeats
85
-
86
-
87
- def run_benchmark(backend: str) -> list[float]:
88
- """指定バックエンドで全サンプルをリサイズし、1回あたりの経過秒のリストを返す。"""
89
- conv.RESIZER_BACKEND = backend
90
- times: list[float] = []
91
- for _ in range(REPEATS):
92
- t0 = perf_counter()
93
- for img in loaded_images:
94
- for w, h in resize_targets:
95
- size = get_size(img, w, h)
96
- resize_image(img, size)
97
- elapsed = perf_counter() - t0
98
- times.append(elapsed)
99
- return times
100
-
101
-
102
- print(f"[{args.backend}] running ({REPEATS} reps) ...", flush=True)
103
- times = run_benchmark(args.backend)
104
- best = min(times)
105
- avg = statistics.mean(times)
106
- ops = len(loaded_images) * len(resize_targets)
107
- throughput = ops / best
108
-
109
- print(f" best={best:.3f}s avg={avg:.3f}s ({ops} resize ops/rep)")
110
- print()
111
- print("=" * 50)
112
- print(" Summary")
113
- print("=" * 50)
114
- print(f" backend {args.backend}")
115
- print(f" pillow {pillow_ver}")
116
- print(f" pillow-simd {simd_ver or 'not installed'}")
117
- print(f" best {best:.3f}s")
118
- print(f" avg {avg:.3f}s")
119
- print(f" throughput {throughput:.0f} ops/s")
120
-
121
- if args.json:
122
- print(
123
- json.dumps(
124
- {
125
- "backend": args.backend,
126
- "pillow": pillow_ver,
127
- "pillow_simd": simd_ver,
128
- "sample_size": len(sample_files),
129
- "repeats": REPEATS,
130
- "ops_per_repeat": ops,
131
- "times": times,
132
- "best": best,
133
- "avg": avg,
134
- "throughput": throughput,
135
- }
136
- )
137
- )
@@ -1,146 +0,0 @@
1
- """Run an isolated Pillow vs pillow-simd vs cykooz benchmark in separate virtualenvs."""
2
-
3
- import argparse
4
- import json
5
- import shutil
6
- import subprocess
7
- import sys
8
- from pathlib import Path
9
-
10
- ROOT = Path(__file__).resolve().parents[1]
11
- PYTHON = sys.executable
12
- ENV_ROOT = ROOT / ".bench-envs"
13
-
14
- COMMON_DEPS = [
15
- "asn1crypto>=1.5.1,<2.0.0",
16
- "pytesseract>=0.3.13,<0.4.0",
17
- "pydantic>=2.12.5,<3.0.0",
18
- "dotenv>=0.9.9,<0.10.0",
19
- ]
20
-
21
- ENV_SPECS = [
22
- {
23
- "name": "pillow",
24
- "packages": ["pillow"],
25
- "backend": "pillow",
26
- },
27
- {
28
- "name": "pillow-simd",
29
- "packages": ["pillow-simd"],
30
- "backend": "pillow",
31
- },
32
- {
33
- "name": "cykooz",
34
- "packages": ["pillow", "cykooz_resizer"],
35
- "backend": "cykooz",
36
- },
37
- ]
38
-
39
-
40
- def parse_args() -> argparse.Namespace:
41
- parser = argparse.ArgumentParser()
42
- parser.add_argument("--sample-size", type=int, default=120)
43
- parser.add_argument("--repeats", type=int, default=3)
44
- parser.add_argument("--keep-envs", action="store_true")
45
- return parser.parse_args()
46
-
47
-
48
- def run(cmd: list[str], cwd: Path | None = None) -> subprocess.CompletedProcess[str]:
49
- return subprocess.run(
50
- cmd,
51
- cwd=cwd or ROOT,
52
- check=True,
53
- text=True,
54
- capture_output=True,
55
- )
56
-
57
-
58
- def env_python(env_dir: Path) -> Path:
59
- return env_dir / "bin" / "python"
60
-
61
-
62
- def setup_env(env_name: str, packages: list[str]) -> Path:
63
- env_dir = ENV_ROOT / env_name
64
- if env_dir.exists():
65
- shutil.rmtree(env_dir)
66
-
67
- print(f"[setup] {env_name}")
68
- run(["uv", "venv", str(env_dir), "--python", PYTHON])
69
- python_bin = env_python(env_dir)
70
-
71
- run(["uv", "pip", "install", "--python", str(python_bin), "-e", ".", "--no-deps"])
72
- run(["uv", "pip", "install", "--python", str(python_bin), *COMMON_DEPS, *packages])
73
- return env_dir
74
-
75
-
76
- def benchmark_env(
77
- env_name: str,
78
- env_dir: Path,
79
- backend: str,
80
- sample_size: int,
81
- repeats: int,
82
- ) -> dict:
83
- python_bin = env_python(env_dir)
84
- result = run(
85
- [
86
- str(python_bin),
87
- "docs/benchmark_resize.py",
88
- "--backend",
89
- backend,
90
- "--sample-size",
91
- str(sample_size),
92
- "--repeats",
93
- str(repeats),
94
- "--json",
95
- ]
96
- )
97
- print(result.stdout)
98
- payload = json.loads(result.stdout.strip().splitlines()[-1])
99
- payload["env_name"] = env_name
100
- return payload
101
-
102
-
103
- def main() -> int:
104
- args = parse_args()
105
- ENV_ROOT.mkdir(exist_ok=True)
106
-
107
- env_dirs: dict[str, Path] = {}
108
- results: list[dict] = []
109
-
110
- for spec in ENV_SPECS:
111
- env_dirs[spec["name"]] = setup_env(spec["name"], spec["packages"])
112
-
113
- try:
114
- for spec in ENV_SPECS:
115
- results.append(
116
- benchmark_env(
117
- spec["name"],
118
- env_dirs[spec["name"]],
119
- spec["backend"],
120
- args.sample_size,
121
- args.repeats,
122
- )
123
- )
124
- finally:
125
- if not args.keep_envs:
126
- shutil.rmtree(ENV_ROOT, ignore_errors=True)
127
-
128
- by_name = {result["env_name"]: result for result in results}
129
- pillow_result = by_name["pillow"]
130
- fastest = min(results, key=lambda result: result["best"])
131
-
132
- print("=" * 60)
133
- print("Isolated Comparison")
134
- print("=" * 60)
135
- for result in results:
136
- speedup = pillow_result["best"] / result["best"]
137
- print(
138
- f"{result['env_name']:<12} best={result['best']:.3f}s "
139
- f"throughput={result['throughput']:.0f} ops/s speedup={speedup:.3f}x"
140
- )
141
- print(f"fastest {fastest['env_name']} (best={fastest['best']:.3f}s)")
142
- return 0
143
-
144
-
145
- if __name__ == "__main__":
146
- raise SystemExit(main())
@@ -1,47 +0,0 @@
1
- from pathlib import Path
2
- from xml.etree import ElementTree as ET
3
-
4
- from libefiling.manifest import Manifest
5
-
6
-
7
- def get_document_code(manifest_path: str) -> str | None:
8
- """Get document code from manifest file
9
-
10
- Args:
11
- manifest_path (str): manifest file path (e.g. manifest.json)
12
- Returns:
13
- str: document code (e.g. A163)
14
- """
15
- mp = Path(manifest_path)
16
- manifest = Manifest.model_validate_json(mp.read_text(encoding="utf-8"))
17
- manifest_dir = mp.parent
18
- xml_dir = manifest_dir / manifest.paths.xml_dir
19
- for xml in manifest.xml_files:
20
- if xml.kind == "procedure":
21
- return get_document_code_from_procedure(str(xml_dir / xml.filename))
22
- else:
23
- return None
24
-
25
-
26
- def get_document_code_from_procedure(procedure_path: str) -> str | None:
27
- """Get document code from procedure.xml file path
28
-
29
- Args:
30
- procedure_path (str): procedure.xml file path
31
- Returns:
32
- str: document code (e.g. A163)
33
- """
34
- ns = {"jp": "http://www.jpo.go.jp"}
35
- tree = ET.parse(procedure_path)
36
- elem = tree.find(".//jp:document-name", ns)
37
- if elem is None:
38
- return None
39
-
40
- # Namespaced attributes are stored as expanded QName keys.
41
- code = elem.get("{http://www.jpo.go.jp}document-code")
42
- return code.strip() if code else None
43
-
44
- if __name__ == "__main__":
45
- import sys
46
-
47
- print(get_document_code(sys.argv[1]))
File without changes
File without changes
File without changes
File without changes
File without changes