libefiling 0.1.60__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. {libefiling-0.1.60 → libefiling-0.2.0}/.gitignore +1 -0
  2. {libefiling-0.1.60 → libefiling-0.2.0}/PKG-INFO +19 -8
  3. {libefiling-0.1.60 → libefiling-0.2.0}/README.md +18 -7
  4. {libefiling-0.1.60 → libefiling-0.2.0}/docs/manifest.md +9 -14
  5. {libefiling-0.1.60 → libefiling-0.2.0}/pyproject.toml +20 -1
  6. {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/__init__.py +1 -2
  7. {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/archive/utils.py +6 -6
  8. {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/manifest.py +55 -24
  9. {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/parse.py +23 -47
  10. {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/xml/kind.py +6 -0
  11. libefiling-0.1.60/docs/benchmark_process_images.py +0 -43
  12. libefiling-0.1.60/docs/benchmark_resize.py +0 -137
  13. libefiling-0.1.60/docs/benchmark_resize_isolated.py +0 -145
  14. libefiling-0.1.60/src/libefiling/xml/utils.py +0 -48
  15. {libefiling-0.1.60 → libefiling-0.2.0}/LICENSE +0 -0
  16. {libefiling-0.1.60 → libefiling-0.2.0}/docs/README.md +0 -0
  17. {libefiling-0.1.60 → libefiling-0.2.0}/docs/archive_structure_notes.md +0 -0
  18. {libefiling-0.1.60 → libefiling-0.2.0}/docs/file-1.png +0 -0
  19. {libefiling-0.1.60 → libefiling-0.2.0}/docs/file-2.png +0 -0
  20. {libefiling-0.1.60 → libefiling-0.2.0}/docs/file-3.png +0 -0
  21. {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/archive/__init__.py +0 -0
  22. {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/archive/aaa.py +0 -0
  23. {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/archive/extract.py +0 -0
  24. {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/archive/handler.py +0 -0
  25. {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/archive/nnf.py +0 -0
  26. {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/charset.py +0 -0
  27. {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/cli.py +0 -0
  28. {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/default_config.py +0 -0
  29. {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/image/__init__.py +0 -0
  30. {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/image/convert.py +0 -0
  31. {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/image/kind.py +0 -0
  32. {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/image/mediatype.py +0 -0
  33. {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/image/ocr.py +0 -0
  34. {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/image/params.py +0 -0
@@ -178,3 +178,4 @@ old
178
178
  output2
179
179
  ./images
180
180
  ./images
181
+ .envrc
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: libefiling
3
- Version: 0.1.60
3
+ Version: 0.2.0
4
4
  Summary: A Python library for e-filing systems.
5
5
  Project-URL: Homepage, https://github.com/hyperion13th144m/libefiling
6
6
  Project-URL: Repository, https://github.com/hyperion13th144m/libefiling
@@ -53,7 +53,7 @@ pip install libefiling
53
53
 
54
54
  ## 使い方
55
55
  ```python
56
- from libefiling import parse_archive, ImageConvertParam, generate_sha256, get_document_code, get_doc_id
56
+ from libefiling import parse_archive, ImageConvertParam, Source
57
57
 
58
58
  params = [
59
59
  ImageConvertParam(
@@ -88,8 +88,13 @@ OUT='output'
88
88
  ### "chemical-formulas", "figures", "equations", "tables", "other-images", "ALL"
89
89
  ### ]
90
90
  ocr_target = ["other-images"]
91
- doc_id = generate_sha256(SRC)
92
- if doc_id === '...':
91
+
92
+ # src のハッシュ値や文書コードを生成して、処理するか判定する例
93
+ source = Source.create(SRC)
94
+ document_code = source.get_document_code()
95
+ if document_code not in ['A163', 'A151']:
96
+ raise ValueError(f"Unsupported document code: {document_code}")
97
+ if source.sha256 == '...':
93
98
  print("Already processed")
94
99
  else:
95
100
  parse_archive(
@@ -101,8 +106,7 @@ else:
101
106
  image_max_workers=0, # 0: CPU数に応じて自動
102
107
  )
103
108
 
104
- print(get_document_code(SRC))
105
- print(get_doc_id("output/manifest.json"))
109
+
106
110
  ```
107
111
  - source.sha256 (Source.create(SRC) で得られる) はアーカイブの内容に応じたハッシュ値であり、再処理判定用に使える。
108
112
  - parse_archive は SRC,PROCを OUTに展開する。第4引数に、画像変換のパラメータを渡せる。
@@ -111,8 +115,7 @@ OUT に各種ファイルが展開される。第5引数はOCR処理対象の画
111
115
  - image_max_workers が 1 のとき: シリアル実行
112
116
  - image_max_workers が 2 以上のとき: スレッド並列実行
113
117
  - image_max_workers が 0 のとき: CPU数ベースで自動設定
114
- - get_document_code アーカイブのパス名か、parse_archive で生成された manifest.json のパスを与えると、文書コード(e.g. A163)を返す。
115
- - get_doc_id は parse_archive で生成された manifest.json のパスを与えると、doc_id を返す。
118
+ - source = Source.create(SRC) で得られる source は、manifest.json の sources や xml/sources.xml に記録される内容と同じ情報を持つ。つまり、parse_archive を実行する前に source.sha256 や文書コードを取得できる。
116
119
 
117
120
  ### 画像変換の高速化オプション
118
121
  既定では Pillow でリサイズします。環境変数 LIBEFILING_RESIZER_BACKEND を指定すると、
@@ -196,3 +199,11 @@ MIT ライセンス
196
199
  0.1.60
197
200
  - get_document_code 関数は、manifest.jsonだけでなく、アーカイブパス・手続ファイルを与えても文書コードを返すようにした。
198
201
  - manifest.json に 文書コードを含めた
202
+
203
+ 0.2.0
204
+ - manifest.json の documents フィールドを sources フィールドに変更した。
205
+ - sources の子要素は配列でなく archive, procedure とした。
206
+ - sources.document_code フィールドは、文書コードを表す
207
+ - get_document_code 廃止,Source クラスの get_document_code で代替
208
+ - get_doc_id, generate_sha256 関数廃止, Source クラスの sha256 で代替
209
+ - xml/sources.xml をはき出すようにした. manifest.json の sources フィールドと同じ内容を表す。
@@ -33,7 +33,7 @@ pip install libefiling
33
33
 
34
34
  ## 使い方
35
35
  ```python
36
- from libefiling import parse_archive, ImageConvertParam, generate_sha256, get_document_code, get_doc_id
36
+ from libefiling import parse_archive, ImageConvertParam, Source
37
37
 
38
38
  params = [
39
39
  ImageConvertParam(
@@ -68,8 +68,13 @@ OUT='output'
68
68
  ### "chemical-formulas", "figures", "equations", "tables", "other-images", "ALL"
69
69
  ### ]
70
70
  ocr_target = ["other-images"]
71
- doc_id = generate_sha256(SRC)
72
- if doc_id === '...':
71
+
72
+ # src のハッシュ値や文書コードを生成して、処理するか判定する例
73
+ source = Source.create(SRC)
74
+ document_code = source.get_document_code()
75
+ if document_code not in ['A163', 'A151']:
76
+ raise ValueError(f"Unsupported document code: {document_code}")
77
+ if source.sha256 == '...':
73
78
  print("Already processed")
74
79
  else:
75
80
  parse_archive(
@@ -81,8 +86,7 @@ else:
81
86
  image_max_workers=0, # 0: CPU数に応じて自動
82
87
  )
83
88
 
84
- print(get_document_code(SRC))
85
- print(get_doc_id("output/manifest.json"))
89
+
86
90
  ```
87
91
  - source.sha256 (Source.create(SRC) で得られる) はアーカイブの内容に応じたハッシュ値であり、再処理判定用に使える。
88
92
  - parse_archive は SRC,PROCを OUTに展開する。第4引数に、画像変換のパラメータを渡せる。
@@ -91,8 +95,7 @@ OUT に各種ファイルが展開される。第5引数はOCR処理対象の画
91
95
  - image_max_workers が 1 のとき: シリアル実行
92
96
  - image_max_workers が 2 以上のとき: スレッド並列実行
93
97
  - image_max_workers が 0 のとき: CPU数ベースで自動設定
94
- - get_document_code アーカイブのパス名か、parse_archive で生成された manifest.json のパスを与えると、文書コード(e.g. A163)を返す。
95
- - get_doc_id は parse_archive で生成された manifest.json のパスを与えると、doc_id を返す。
98
+ - source = Source.create(SRC) で得られる source は、manifest.json の sources や xml/sources.xml に記録される内容と同じ情報を持つ。つまり、parse_archive を実行する前に source.sha256 や文書コードを取得できる。
96
99
 
97
100
  ### 画像変換の高速化オプション
98
101
  既定では Pillow でリサイズします。環境変数 LIBEFILING_RESIZER_BACKEND を指定すると、
@@ -176,3 +179,11 @@ MIT ライセンス
176
179
  0.1.60
177
180
  - get_document_code 関数は、manifest.jsonだけでなく、アーカイブパス・手続ファイルを与えても文書コードを返すようにした。
178
181
  - manifest.json に 文書コードを含めた
182
+
183
+ 0.2.0
184
+ - manifest.json の documents フィールドを sources フィールドに変更した。
185
+ - sources の子要素は配列でなく archive, procedure とした。
186
+ - sources.document_code フィールドは、文書コードを表す
187
+ - get_document_code 廃止,Source クラスの get_document_code で代替
188
+ - get_doc_id, generate_sha256 関数廃止, Source クラスの sha256 で代替
189
+ - xml/sources.xml をはき出すようにした. manifest.json の sources フィールドと同じ内容を表す。
@@ -40,7 +40,7 @@ manifest.json は、次の設計方針に基づいている。
40
40
  {
41
41
  "manifest_version": "1.0.0",
42
42
  "generator": { ... },
43
- "document": { ... },
43
+ "sources": { ... },
44
44
  "paths": { ... },
45
45
  "xml_files": [ ... ],
46
46
  "images": [ ... ],
@@ -71,13 +71,11 @@ manifest.json は、次の設計方針に基づいている。
71
71
  - 再現性やデバッグのために使用される
72
72
 
73
73
 
74
- ## 4.3 document
74
+ ### 4.3 sources
75
75
  ```json
76
- "document": {
77
- "doc_id": "D000001",
78
- "code": "A163",
79
- "sources": [
80
- {
76
+ "sources": {
77
+ "document_code": "A163",
78
+ "archive": {
81
79
  "filename": "...AAA.JWX",
82
80
  "sha256": "...",
83
81
  "byte_size": 12345678,
@@ -85,7 +83,7 @@ manifest.json は、次の設計方針に基づいている。
85
83
  "kind": "AA",
86
84
  "extension": ".JWX"
87
85
  },
88
- {
86
+ "procedure": {
89
87
  "filename": "...AFM.XML",
90
88
  "sha256": "...",
91
89
  "byte_size": 4220,
@@ -93,14 +91,12 @@ manifest.json は、次の設計方針に基づいている。
93
91
  "kind": "FM",
94
92
  "extension": ".XML"
95
93
  }
96
- ]
97
94
  }
98
95
  ```
99
96
 
100
- - doc_id は、この文書単位を一意に識別するためのID
101
- - code は、文書の分類コード
102
- - source は基になったファイルに関する情報
103
- - archive_sha256 は再処理判定や追跡用
97
+ - document_code は、文書の分類コード
98
+ - archive, procedure は基になったファイルに関する情報
99
+ - sha256 はarchive, procedure のファイル内容に基づいて生成されたハッシュ値。処理済みかどうかの判定に使える
104
100
  - task, kind, extension はファイル名から得られるアーカイブの属する業務、種類、拡張子
105
101
  - task の値は以下の通り
106
102
  - A: 出願
@@ -118,7 +114,6 @@ manifest.json は、次の設計方針に基づいている。
118
114
  - ER: 緊急避難用送信ファイル
119
115
  - FM: 手続情報管理ファイル
120
116
  - XX: 不明(上記に当てはまらない場合)
121
- - procedure_source は手続き情報ファイルに関する情報
122
117
 
123
118
  ### 4.4 paths
124
119
  ```json
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "libefiling"
3
- version = "0.1.60"
3
+ version = "0.2.0"
4
4
  description = "A Python library for e-filing systems."
5
5
  authors = [{ name = "hyperion13th144m", email = "hyperion13th144m@gmail.com" }]
6
6
  requires-python = ">=3.12"
@@ -41,3 +41,22 @@ include = ["src/libefiling"]
41
41
  [build-system]
42
42
  requires = ["hatchling"]
43
43
  build-backend = "hatchling.build"
44
+
45
+ [tool.ruff.lint]
46
+ # 1. Enable flake8-bugbear (`B`) rules, in addition to the defaults.
47
+ select = ["E4", "E7", "E9", "F", "B"]
48
+
49
+ # 2. Avoid enforcing line-length violations (`E501`)
50
+ ignore = ["E501"]
51
+
52
+ # 3. Avoid trying to fix flake8-bugbear (`B`) violations.
53
+ unfixable = ["B"]
54
+
55
+ # 4. Ignore `E402` (import violations) in all `__init__.py` files, and in selected subdirectories.
56
+ [tool.ruff.lint.per-file-ignores]
57
+ "**/{tests,docs,tools}/*" = ["E402"]
58
+ "__init__.py" = ["E402"]
59
+
60
+ [tool.ruff.format]
61
+ # 5. Use double quotes in `ruff format`.
62
+ quote-style = "double"
@@ -1,5 +1,4 @@
1
1
  from .archive.utils import generate_sha256
2
2
  from .image.params import ImageConvertParam
3
- from .manifest import Manifest, get_doc_id
3
+ from .manifest import Manifest, Source
4
4
  from .parse import parse_archive
5
- from .xml.utils import get_document_code
@@ -2,19 +2,19 @@ import hashlib
2
2
  from pathlib import Path
3
3
 
4
4
 
5
- def generate_sha256(archive_path: str | Path) -> str:
6
- """return document sha256 based on archive_path content
5
+ def generate_sha256(file_path: str | Path) -> str:
6
+ """return document sha256 based on file_path content
7
7
 
8
8
  Args:
9
- archive_path (str | Path): archive path
9
+ file_path (str | Path): file path
10
10
 
11
11
  Returns:
12
12
  str: document sha256
13
13
  """
14
14
  sha256_hash = hashlib.sha256()
15
- if isinstance(archive_path, Path):
16
- archive_path = str(archive_path)
17
- with open(archive_path, "rb") as f:
15
+ if isinstance(file_path, Path):
16
+ file_path = str(file_path)
17
+ with open(file_path, "rb") as f:
18
18
  # Read and update hash string value in blocks of 4K
19
19
  for byte_block in iter(lambda: f.read(4096), b""):
20
20
  sha256_hash.update(byte_block)
@@ -1,12 +1,13 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from datetime import datetime
4
- from enum import Enum
5
4
  from pathlib import Path
6
- from typing import List, Literal, Optional, get_args
5
+ from typing import List, Optional
6
+ from xml.etree import ElementTree as ET
7
7
 
8
8
  from pydantic import BaseModel, Field
9
9
 
10
+ from libefiling.archive.utils import generate_sha256
10
11
  from libefiling.image.kind import IMAGE_KIND
11
12
  from libefiling.xml.kind import XML_KIND
12
13
 
@@ -30,13 +31,14 @@ class Source(BaseModel):
30
31
  extension: str
31
32
 
32
33
  @classmethod
33
- def create(cls, file_path: str, sha256: str) -> Source:
34
+ def create(cls, file_path: str | Path) -> Source:
34
35
  """Create Source from file path
35
36
 
36
37
  Args:
37
- file_path (str): file path
38
+ file_path (str | Path): file path
38
39
  """
39
40
  filename = Path(file_path).name
41
+ sha256 = generate_sha256(file_path)
40
42
  byte_size = Path(file_path).stat().st_size
41
43
  if len(filename) == 63:
42
44
  task = filename[56 : 56 + 1]
@@ -54,11 +56,55 @@ class Source(BaseModel):
54
56
  extension=extension,
55
57
  )
56
58
 
59
+ def get_document_code(self) -> str:
60
+ """Get document code from archive file name
61
+
62
+ Args:
63
+ Returns:
64
+ str: document code (e.g. A163), or "UNKNOWN" if the filename is shorter than 29 characters
65
+ """
66
+ if len(self.filename) < 29:
67
+ return "UNKNOWN"
68
+ else:
69
+ return self.filename[19 : 19 + 9].replace("_", "").strip()
57
70
 
58
- class DocumentInfo(BaseModel):
59
- doc_id: str
60
- code: str
61
- sources: List[Source]
71
+
72
+ class Sources(BaseModel):
73
+ document_code: str
74
+ archive: Source
75
+ procedure: Source
76
+
77
+ def save_as_xml(self, xml_path: str) -> None:
78
+ """Save Sources as XML file
79
+
80
+ Args:
81
+ xml_path (str): XML file path to save
82
+ """
83
+ root = ET.Element("sources", attrib={"document-code": self.document_code})
84
+ for source in [self.archive, self.procedure]:
85
+ ET.SubElement(
86
+ root,
87
+ "source",
88
+ attrib={
89
+ "filename": source.filename,
90
+ "sha256": source.sha256,
91
+ "byte-size": str(source.byte_size),
92
+ "task": source.task,
93
+ "kind": source.kind,
94
+ "extension": source.extension,
95
+ },
96
+ )
97
+ tree = ET.ElementTree(root)
98
+ tree.write(xml_path, encoding="utf-8", xml_declaration=True)
99
+
100
+ def to_xml_file(self, xml_path: str) -> XmlFile:
101
+ return XmlFile(
102
+ filename=Path(xml_path).name,
103
+ original_filename=None,
104
+ sha256=generate_sha256(xml_path),
105
+ encoding=EncodingInfo(detected="UTF-8", normalized_to="UTF-8"),
106
+ kind="source",
107
+ )
62
108
 
63
109
 
64
110
  # -------------------------
@@ -152,23 +198,8 @@ class Stats(BaseModel):
152
198
  class Manifest(BaseModel):
153
199
  manifest_version: str = "1.0.0"
154
200
  generator: GeneratorInfo
155
- document: DocumentInfo
201
+ sources: Sources
156
202
  paths: Paths = Paths()
157
203
  xml_files: List[XmlFile] = []
158
204
  images: List[ImageEntry] = []
159
205
  stats: Stats
160
- images: List[ImageEntry] = []
161
- stats: Stats
162
-
163
-
164
- def get_doc_id(manifest_path: str) -> str | None:
165
- """Get document ID from manifest file
166
-
167
- Args:
168
- manifest_path (str): manifest file path (e.g. manifest.json)
169
- Returns:
170
- str: document ID (e.g. 2024000000000)
171
- """
172
- mp = Path(manifest_path)
173
- manifest = Manifest.model_validate_json(mp.read_text(encoding="utf-8"))
174
- return manifest.document.doc_id.strip() if manifest.document.doc_id else None
@@ -5,14 +5,12 @@ from importlib.metadata import version as get_version
5
5
  from itertools import chain
6
6
  from pathlib import Path
7
7
  from typing import Iterable, Iterator, List
8
- from xml.etree import ElementTree as ET
9
8
 
10
9
  from libefiling.archive.utils import generate_sha256
11
10
  from libefiling.image.kind import OCR_TARGET, detect_image_kind
12
11
  from libefiling.image.mediatype import get_media_type
13
12
  from libefiling.manifest import (
14
13
  DerivedImage,
15
- DocumentInfo,
16
14
  EncodingInfo,
17
15
  GeneratorInfo,
18
16
  ImageAttributes,
@@ -20,6 +18,7 @@ from libefiling.manifest import (
20
18
  Manifest,
21
19
  OcrInfo,
22
20
  Source,
21
+ Sources,
23
22
  Stats,
24
23
  XmlFile,
25
24
  )
@@ -69,7 +68,8 @@ def parse_archive(
69
68
  xml_files = process_xml(raw_xml_files, xml_dir)
70
69
 
71
70
  ### convert charset of procedure xml to UTF-8 and save to xml_dir
72
- xml_files.append(process_procedure_xml(Path(src_procedure_path), xml_dir))
71
+ proc_xml_path = xml_dir / "procedure.xml"
72
+ xml_files.append(process_procedure_xml(Path(src_procedure_path), proc_xml_path))
73
73
 
74
74
  ### guess language
75
75
  lang = guess_language_by_filename(str(xml_dir))
@@ -89,16 +89,24 @@ def parse_archive(
89
89
  max_workers=image_max_workers,
90
90
  )
91
91
 
92
- code = get_document_code_from_procedure(str(xml_dir / "procedure.xml"))
92
+ ### generate sources.xml
93
+ source_archive = Source.create(src_archive_path)
94
+ source_proc = Source.create(src_procedure_path)
95
+ sources = Sources(
96
+ document_code=source_archive.get_document_code(),
97
+ archive=source_archive,
98
+ procedure=source_proc,
99
+ )
100
+ sources_xml_path = str(xml_dir / "sources.xml")
101
+ sources.save_as_xml(sources_xml_path)
102
+ xml_files.append(sources.to_xml_file(sources_xml_path))
93
103
 
94
104
  # generate manifest
95
105
  manifest = process_manifest(
96
- src_archive_path,
97
- src_procedure_path,
106
+ sources,
98
107
  str(xml_dir),
99
108
  xml_files,
100
109
  images,
101
- code=code if code else "UNKNOWN",
102
110
  )
103
111
 
104
112
  manifest_path = output_root / "manifest.json"
@@ -151,16 +159,14 @@ def process_xml(
151
159
 
152
160
  def process_procedure_xml(
153
161
  src_procedure_path: Path,
154
- xml_dir: Path,
155
- filename: str = "procedure.xml",
162
+ xml_path: Path,
156
163
  ) -> XmlFile:
157
- xml_path = xml_dir / filename
158
164
  convert_xml_charset(str(src_procedure_path), str(xml_path))
159
165
  return XmlFile(
160
- filename=filename,
166
+ filename=xml_path.name,
161
167
  encoding=EncodingInfo(detected="shift_jis", normalized_to="UTF-8"),
162
168
  sha256=generate_sha256(xml_path),
163
- kind=detect_xml_kind(filename),
169
+ kind=detect_xml_kind(xml_path.name),
164
170
  )
165
171
 
166
172
 
@@ -180,7 +186,9 @@ def process_images(
180
186
  workers = _resolve_worker_count(max_workers)
181
187
  if workers <= 1 or len(image_list) == 1:
182
188
  return [
183
- _process_single_image(image, images_dir, ocr_dir, image_params, lang, ocr_target)
189
+ _process_single_image(
190
+ image, images_dir, ocr_dir, image_params, lang, ocr_target
191
+ )
184
192
  for image in image_list
185
193
  ]
186
194
 
@@ -278,12 +286,10 @@ def get_ocr_text(image: Path, ocr_dir: Path, lang: str) -> OcrInfo:
278
286
 
279
287
 
280
288
  def process_manifest(
281
- src_archive_path: str,
282
- src_procedure_path: str,
289
+ sources: Sources,
283
290
  xml_dir: str,
284
291
  xml_files: list[XmlFile],
285
292
  images: list[ImageEntry],
286
- code: str,
287
293
  ) -> Manifest:
288
294
  manifest = Manifest(
289
295
  generator=GeneratorInfo(
@@ -291,18 +297,7 @@ def process_manifest(
291
297
  version=get_version("libefiling"),
292
298
  created_at=datetime.now(),
293
299
  ),
294
- document=DocumentInfo(
295
- doc_id=generate_sha256(src_archive_path),
296
- code=code,
297
- sources=[
298
- Source.create(
299
- src_archive_path, sha256=generate_sha256(src_archive_path)
300
- ),
301
- Source.create(
302
- src_procedure_path, sha256=generate_sha256(src_procedure_path)
303
- ),
304
- ],
305
- ),
300
+ sources=sources,
306
301
  xml_files=xml_files,
307
302
  images=images,
308
303
  stats=Stats(
@@ -314,22 +309,3 @@ def process_manifest(
314
309
  )
315
310
 
316
311
  return manifest
317
-
318
- def get_document_code_from_procedure(procedure_path: str) -> str | None:
319
- """Get document code from procedure.xml file path
320
-
321
- Args:
322
- procedure_path (str): procedure.xml file path
323
- Returns:
324
- str: document code (e.g. A163) or None if not found
325
- """
326
- ns = {"jp": "http://www.jpo.go.jp"}
327
- tree = ET.parse(procedure_path)
328
- elem = tree.find(".//jp:document-name", ns)
329
- if elem is None:
330
- return None
331
-
332
- # Namespaced attributes are stored as expanded QName keys.
333
- code = elem.get("{http://www.jpo.go.jp}document-code")
334
- return code.strip() if code else None
335
-
@@ -23,6 +23,7 @@ XML_KIND = Literal[
23
23
  "special-attached-documents",
24
24
  "special-st26-sequence-list",
25
25
  "procedure",
26
+ "source",
26
27
  "unknown",
27
28
  ]
28
29
 
@@ -139,6 +140,11 @@ re_xml: list[XML_RE_MAP] = [
139
140
  "regex": re.compile(r"procedure\.xml"),
140
141
  "description": "procedure XML procedure.xml",
141
142
  },
143
+ {
144
+ "kind": "source",
145
+ "regex": re.compile(r"source\.xml"),
146
+ "description": "source XML source.xml",
147
+ },
142
148
  ]
143
149
 
144
150
 
@@ -1,43 +0,0 @@
1
- from pathlib import Path
2
- from time import perf_counter
3
-
4
- from libefiling.default_config import defaultImageParams
5
- from libefiling.parse import process_images
6
-
7
- # Benchmark target: images/ 以下の tif 画像
8
- image_files = sorted(Path("images").rglob("*.tif"))
9
- if not image_files:
10
- raise SystemExit("No *.tif files found under images/")
11
-
12
- sample_size = min(120, len(image_files))
13
- sample = image_files[:sample_size]
14
- print(f"sample: {sample_size} files")
15
-
16
-
17
- def run_case(max_workers: int | None, out_root: Path) -> float:
18
- out_images = out_root / "images"
19
- out_ocr = out_root / "ocr"
20
- out_images.mkdir(parents=True, exist_ok=True)
21
- out_ocr.mkdir(parents=True, exist_ok=True)
22
-
23
- start = perf_counter()
24
- result = process_images(
25
- sample,
26
- out_images,
27
- out_ocr,
28
- defaultImageParams,
29
- "jpn",
30
- None,
31
- max_workers=max_workers,
32
- )
33
- elapsed = perf_counter() - start
34
- print(
35
- f"max_workers={max_workers} elapsed={elapsed:.3f}s items={len(result)}"
36
- )
37
- return elapsed
38
-
39
-
40
- serial_sec = run_case(1, Path("/tmp/libefiling-bench-serial"))
41
- parallel_sec = run_case(0, Path("/tmp/libefiling-bench-auto"))
42
- print(f"speedup: {serial_sec / parallel_sec:.3f}x")
43
- print(f"time_reduction: {(serial_sec - parallel_sec) / serial_sec * 100:.2f}%")
@@ -1,137 +0,0 @@
1
- """Benchmark the current PIL implementation in the active environment."""
2
-
3
- import argparse
4
- import importlib.metadata
5
- import json
6
- import statistics
7
- from pathlib import Path
8
- from time import perf_counter
9
-
10
- import libefiling.image.convert as conv
11
- from libefiling.default_config import defaultImageParams
12
- from libefiling.image.convert import get_size, load_image, resize_image
13
-
14
-
15
- def parse_args() -> argparse.Namespace:
16
- parser = argparse.ArgumentParser()
17
- parser.add_argument("--sample-size", type=int, default=120)
18
- parser.add_argument("--repeats", type=int, default=3)
19
- parser.add_argument("--backend", default="pillow")
20
- parser.add_argument("--json", action="store_true")
21
- return parser.parse_args()
22
-
23
-
24
- args = parse_args()
25
-
26
- # --------------------------------------------------------------------------- #
27
- # 画像ファイルの収集
28
- # --------------------------------------------------------------------------- #
29
- DATA_ROOT = Path("images/var/data")
30
- image_files = sorted(DATA_ROOT.rglob("*.tif"))
31
- if not image_files:
32
- raise SystemExit(f"No *.tif files found under {DATA_ROOT}")
33
-
34
- MAX_SAMPLES = 120
35
- sample_files = image_files[: min(args.sample_size, len(image_files))]
36
- print(f"Total *.tif: {len(image_files)} → using {len(sample_files)} files")
37
-
38
- # --------------------------------------------------------------------------- #
39
- # pillow-simd の有無を確認
40
- # --------------------------------------------------------------------------- #
41
- try:
42
- simd_ver = importlib.metadata.version("pillow-simd")
43
- print(f"pillow-simd : {simd_ver}")
44
- except importlib.metadata.PackageNotFoundError:
45
- simd_ver = None
46
- print("pillow-simd : not installed (pillow-simd backend は pillow にフォールバックします)")
47
-
48
- try:
49
- pillow_ver = importlib.metadata.version("Pillow")
50
- except importlib.metadata.PackageNotFoundError:
51
- pillow_ver = "unknown"
52
- print(f"Pillow : {pillow_ver}")
53
-
54
- if simd_ver is not None:
55
- print("NOTE: pillow-simd をインストールした環境では PIL 自体が pillow-simd 実装です。")
56
- print(" このスクリプト内の 'pillow' と 'pillow-simd' は別バイナリ比較ではなく、")
57
- print(" 同じ PIL 実装に対する別コードパス比較になります。")
58
- else:
59
- print("NOTE: pillow-simd 未導入環境では 'pillow-simd' backend は Pillow にフォールバックします。")
60
- print(" このスクリプト内の 'pillow' と 'pillow-simd' は別バイナリ比較にはなりません。")
61
-
62
- print(" 真の比較を行うには、Pillow 環境と pillow-simd 環境を分けて個別に実行してください。")
63
- print()
64
-
65
- # --------------------------------------------------------------------------- #
66
- # 画像を事前にメモリへロード(I/O をベンチから除外)
67
- # --------------------------------------------------------------------------- #
68
- print("Loading images into memory...", end=" ", flush=True)
69
- loaded_images = []
70
- for p in sample_files:
71
- try:
72
- loaded_images.append(load_image(p))
73
- except Exception as e:
74
- print(f"\nSkipping {p}: {e}")
75
- print(f"{len(loaded_images)} images loaded.")
76
- print()
77
-
78
- # リサイズターゲット(defaultImageParams の全サイズを使用)
79
- resize_targets = [(p.width, p.height) for p in defaultImageParams]
80
-
81
- # --------------------------------------------------------------------------- #
82
- # ベンチマーク本体
83
- # --------------------------------------------------------------------------- #
84
- REPEATS = args.repeats
85
-
86
-
87
- def run_benchmark(backend: str) -> list[float]:
88
- """指定バックエンドで全サンプルをリサイズし、1回あたりの経過秒のリストを返す。"""
89
- conv.RESIZER_BACKEND = backend
90
- times: list[float] = []
91
- for _ in range(REPEATS):
92
- t0 = perf_counter()
93
- for img in loaded_images:
94
- for w, h in resize_targets:
95
- size = get_size(img, w, h)
96
- resize_image(img, size)
97
- elapsed = perf_counter() - t0
98
- times.append(elapsed)
99
- return times
100
-
101
-
102
- print(f"[{args.backend}] running ({REPEATS} reps) ...", flush=True)
103
- times = run_benchmark(args.backend)
104
- best = min(times)
105
- avg = statistics.mean(times)
106
- ops = len(loaded_images) * len(resize_targets)
107
- throughput = ops / best
108
-
109
- print(f" best={best:.3f}s avg={avg:.3f}s ({ops} resize ops/rep)")
110
- print()
111
- print("=" * 50)
112
- print(" Summary")
113
- print("=" * 50)
114
- print(f" backend {args.backend}")
115
- print(f" pillow {pillow_ver}")
116
- print(f" pillow-simd {simd_ver or 'not installed'}")
117
- print(f" best {best:.3f}s")
118
- print(f" avg {avg:.3f}s")
119
- print(f" throughput {throughput:.0f} ops/s")
120
-
121
- if args.json:
122
- print(
123
- json.dumps(
124
- {
125
- "backend": args.backend,
126
- "pillow": pillow_ver,
127
- "pillow_simd": simd_ver,
128
- "sample_size": len(sample_files),
129
- "repeats": REPEATS,
130
- "ops_per_repeat": ops,
131
- "times": times,
132
- "best": best,
133
- "avg": avg,
134
- "throughput": throughput,
135
- }
136
- )
137
- )
@@ -1,145 +0,0 @@
1
- """Run an isolated Pillow vs pillow-simd vs cykooz benchmark in separate virtualenvs."""
2
-
3
- import argparse
4
- import json
5
- import shutil
6
- import subprocess
7
- import sys
8
- from pathlib import Path
9
-
10
- ROOT = Path(__file__).resolve().parents[1]
11
- PYTHON = sys.executable
12
- ENV_ROOT = ROOT / ".bench-envs"
13
-
14
- COMMON_DEPS = [
15
- "asn1crypto>=1.5.1,<2.0.0",
16
- "pytesseract>=0.3.13,<0.4.0",
17
- "pydantic>=2.12.5,<3.0.0",
18
- ]
19
-
20
- ENV_SPECS = [
21
- {
22
- "name": "pillow",
23
- "packages": ["pillow"],
24
- "backend": "pillow",
25
- },
26
- {
27
- "name": "pillow-simd",
28
- "packages": ["pillow-simd"],
29
- "backend": "pillow",
30
- },
31
- {
32
- "name": "cykooz",
33
- "packages": ["pillow", "cykooz_resizer"],
34
- "backend": "cykooz",
35
- },
36
- ]
37
-
38
-
39
- def parse_args() -> argparse.Namespace:
40
- parser = argparse.ArgumentParser()
41
- parser.add_argument("--sample-size", type=int, default=120)
42
- parser.add_argument("--repeats", type=int, default=3)
43
- parser.add_argument("--keep-envs", action="store_true")
44
- return parser.parse_args()
45
-
46
-
47
- def run(cmd: list[str], cwd: Path | None = None) -> subprocess.CompletedProcess[str]:
48
- return subprocess.run(
49
- cmd,
50
- cwd=cwd or ROOT,
51
- check=True,
52
- text=True,
53
- capture_output=True,
54
- )
55
-
56
-
57
- def env_python(env_dir: Path) -> Path:
58
- return env_dir / "bin" / "python"
59
-
60
-
61
- def setup_env(env_name: str, packages: list[str]) -> Path:
62
- env_dir = ENV_ROOT / env_name
63
- if env_dir.exists():
64
- shutil.rmtree(env_dir)
65
-
66
- print(f"[setup] {env_name}")
67
- run(["uv", "venv", str(env_dir), "--python", PYTHON])
68
- python_bin = env_python(env_dir)
69
-
70
- run(["uv", "pip", "install", "--python", str(python_bin), "-e", ".", "--no-deps"])
71
- run(["uv", "pip", "install", "--python", str(python_bin), *COMMON_DEPS, *packages])
72
- return env_dir
73
-
74
-
75
- def benchmark_env(
76
- env_name: str,
77
- env_dir: Path,
78
- backend: str,
79
- sample_size: int,
80
- repeats: int,
81
- ) -> dict:
82
- python_bin = env_python(env_dir)
83
- result = run(
84
- [
85
- str(python_bin),
86
- "docs/benchmark_resize.py",
87
- "--backend",
88
- backend,
89
- "--sample-size",
90
- str(sample_size),
91
- "--repeats",
92
- str(repeats),
93
- "--json",
94
- ]
95
- )
96
- print(result.stdout)
97
- payload = json.loads(result.stdout.strip().splitlines()[-1])
98
- payload["env_name"] = env_name
99
- return payload
100
-
101
-
102
- def main() -> int:
103
- args = parse_args()
104
- ENV_ROOT.mkdir(exist_ok=True)
105
-
106
- env_dirs: dict[str, Path] = {}
107
- results: list[dict] = []
108
-
109
- for spec in ENV_SPECS:
110
- env_dirs[spec["name"]] = setup_env(spec["name"], spec["packages"])
111
-
112
- try:
113
- for spec in ENV_SPECS:
114
- results.append(
115
- benchmark_env(
116
- spec["name"],
117
- env_dirs[spec["name"]],
118
- spec["backend"],
119
- args.sample_size,
120
- args.repeats,
121
- )
122
- )
123
- finally:
124
- if not args.keep_envs:
125
- shutil.rmtree(ENV_ROOT, ignore_errors=True)
126
-
127
- by_name = {result["env_name"]: result for result in results}
128
- pillow_result = by_name["pillow"]
129
- fastest = min(results, key=lambda result: result["best"])
130
-
131
- print("=" * 60)
132
- print("Isolated Comparison")
133
- print("=" * 60)
134
- for result in results:
135
- speedup = pillow_result["best"] / result["best"]
136
- print(
137
- f"{result['env_name']:<12} best={result['best']:.3f}s "
138
- f"throughput={result['throughput']:.0f} ops/s speedup={speedup:.3f}x"
139
- )
140
- print(f"fastest {fastest['env_name']} (best={fastest['best']:.3f}s)")
141
- return 0
142
-
143
-
144
- if __name__ == "__main__":
145
- raise SystemExit(main())
@@ -1,48 +0,0 @@
1
- from libefiling.manifest import Manifest
2
-
3
-
4
- def get_document_code(file_path: str) -> str | None:
5
- """Get document code from manifest, archive or procedure file
6
-
7
- Args:
8
- file_path (str): manifest, archive or procedure file path
9
- Returns:
10
- str: document code (e.g. A163) or None if not found
11
- """
12
- if file_path.endswith("manifest.json"):
13
- return get_document_code_from_manifest(file_path)
14
- else:
15
- return get_document_code_from_filename(file_path)
16
-
17
-
18
- def get_document_code_from_manifest(manifest_path: str) -> str | None:
19
- """Get document code from manifest file path
20
-
21
- Args:
22
- manifest_path (str): manifest file path
23
- Returns:
24
- str: document code (e.g. A163) or None if not found
25
- """
26
- with open(manifest_path, "r", encoding="utf-8") as f:
27
- manifest = Manifest.model_validate(f.read())
28
- return manifest.document.code if manifest.document.code else None
29
-
30
-
31
-
32
- def get_document_code_from_filename(file_path: str) -> str | None:
33
- """Get document code from archive file name
34
-
35
- Args:
36
- file_path (str): archive file path
37
- Returns:
38
- str: document code (e.g. A163) or None if not found
39
- """
40
- if len(file_path) < 29:
41
- return None
42
- else:
43
- return file_path[20:20 + 9].replace("_", "").strip()
44
-
45
- if __name__ == "__main__":
46
- import sys
47
-
48
- print(get_document_code(sys.argv[1]))
File without changes
File without changes
File without changes
File without changes
File without changes