libefiling 0.1.60__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {libefiling-0.1.60 → libefiling-0.2.0}/.gitignore +1 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/PKG-INFO +19 -8
- {libefiling-0.1.60 → libefiling-0.2.0}/README.md +18 -7
- {libefiling-0.1.60 → libefiling-0.2.0}/docs/manifest.md +9 -14
- {libefiling-0.1.60 → libefiling-0.2.0}/pyproject.toml +20 -1
- {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/__init__.py +1 -2
- {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/archive/utils.py +6 -6
- {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/manifest.py +55 -24
- {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/parse.py +23 -47
- {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/xml/kind.py +6 -0
- libefiling-0.1.60/docs/benchmark_process_images.py +0 -43
- libefiling-0.1.60/docs/benchmark_resize.py +0 -137
- libefiling-0.1.60/docs/benchmark_resize_isolated.py +0 -145
- libefiling-0.1.60/src/libefiling/xml/utils.py +0 -48
- {libefiling-0.1.60 → libefiling-0.2.0}/LICENSE +0 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/docs/README.md +0 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/docs/archive_structure_notes.md +0 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/docs/file-1.png +0 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/docs/file-2.png +0 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/docs/file-3.png +0 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/archive/__init__.py +0 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/archive/aaa.py +0 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/archive/extract.py +0 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/archive/handler.py +0 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/archive/nnf.py +0 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/charset.py +0 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/cli.py +0 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/default_config.py +0 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/image/__init__.py +0 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/image/convert.py +0 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/image/kind.py +0 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/image/mediatype.py +0 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/image/ocr.py +0 -0
- {libefiling-0.1.60 → libefiling-0.2.0}/src/libefiling/image/params.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: libefiling
|
|
3
|
-
Version: 0.1.60
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: A Python library for e-filing systems.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hyperion13th144m/libefiling
|
|
6
6
|
Project-URL: Repository, https://github.com/hyperion13th144m/libefiling
|
|
@@ -53,7 +53,7 @@ pip install libefiling
|
|
|
53
53
|
|
|
54
54
|
## 使い方
|
|
55
55
|
```python
|
|
56
|
-
from libefiling import parse_archive, ImageConvertParam,
|
|
56
|
+
from libefiling import parse_archive, ImageConvertParam, Source
|
|
57
57
|
|
|
58
58
|
params = [
|
|
59
59
|
ImageConvertParam(
|
|
@@ -88,8 +88,13 @@ OUT='output'
|
|
|
88
88
|
### "chemical-formulas", "figures", "equations", "tables", "other-images", "ALL"
|
|
89
89
|
### ]
|
|
90
90
|
ocr_target = ["other-images"]
|
|
91
|
-
|
|
92
|
-
|
|
91
|
+
|
|
92
|
+
# src のハッシュ値や文書コードを生成して、処理するか判定する例
|
|
93
|
+
source = Source.create(SRC)
|
|
94
|
+
document_code = source.get_document_code()
|
|
95
|
+
if document_code not in ['A163', 'A151']:
|
|
96
|
+
raise ValueError(f"Unsupported document code: {document_code}")
|
|
97
|
+
if source.sha256 == '...':
|
|
93
98
|
print("Already processed")
|
|
94
99
|
else:
|
|
95
100
|
parse_archive(
|
|
@@ -101,8 +106,7 @@ else:
|
|
|
101
106
|
image_max_workers=0, # 0: CPU数に応じて自動
|
|
102
107
|
)
|
|
103
108
|
|
|
104
|
-
|
|
105
|
-
print(get_doc_id("output/manifest.json"))
|
|
109
|
+
|
|
106
110
|
```
|
|
107
111
|
- generate_sha256 はアーカイブの内容に応じたハッシュ値を生成し、再処理判定用に使える。
|
|
108
112
|
- parse_archive は SRC,PROCを OUTに展開する。第4引数に、画像変換のパラメータを渡せる。
|
|
@@ -111,8 +115,7 @@ OUT に各種ファイルが展開される。第5引数はOCR処理対象の画
|
|
|
111
115
|
- image_max_workers が 1 のとき: シリアル実行
|
|
112
116
|
- image_max_workers が 2 以上のとき: スレッド並列実行
|
|
113
117
|
- image_max_workers が 0 のとき: CPU数ベースで自動設定
|
|
114
|
-
|
|
115
|
-
- get_doc_id は parse_archive で生成された manifest.json のパスを与えると、doc_id を返す。
|
|
118
|
+
- source = Source.create(SRC) の source は、manifest.json, xml/sources.xml の内容とおなじ。parse_archive するまえに、source.sha256 を得られるということ。
|
|
116
119
|
|
|
117
120
|
### 画像変換の高速化オプション
|
|
118
121
|
既定では Pillow でリサイズします。環境変数 LIBEFILING_RESIZER_BACKEND を指定すると、
|
|
@@ -196,3 +199,11 @@ MIT ライセンス
|
|
|
196
199
|
0.1.60
|
|
197
200
|
- get_document_code 関数は、manifest.jsonだけでなく、アーカイブパス・手続ファイルを与えても文書コードを返すようにした。
|
|
198
201
|
- manifest.json に 文書コードを含めた
|
|
202
|
+
|
|
203
|
+
0.2.0
|
|
204
|
+
- manifest.json の documents フィールドを sources フィールドに変更した。
|
|
205
|
+
- sources の子要素は配列でなく archive, procedure とした。
|
|
206
|
+
- sources.document_code フィールドは、文書コードを表す
|
|
207
|
+
- get_document_code 廃止,Source クラスの get_document_code で代替
|
|
208
|
+
- get_doc_id, generate_sha256 関数廃止, Source クラスの sha256 で代替
|
|
209
|
+
- xml/sources.xml をはき出すようにした. manifest.json の sources フィールドと同じ内容を表す。
|
|
@@ -33,7 +33,7 @@ pip install libefiling
|
|
|
33
33
|
|
|
34
34
|
## 使い方
|
|
35
35
|
```python
|
|
36
|
-
from libefiling import parse_archive, ImageConvertParam,
|
|
36
|
+
from libefiling import parse_archive, ImageConvertParam, Source
|
|
37
37
|
|
|
38
38
|
params = [
|
|
39
39
|
ImageConvertParam(
|
|
@@ -68,8 +68,13 @@ OUT='output'
|
|
|
68
68
|
### "chemical-formulas", "figures", "equations", "tables", "other-images", "ALL"
|
|
69
69
|
### ]
|
|
70
70
|
ocr_target = ["other-images"]
|
|
71
|
-
|
|
72
|
-
|
|
71
|
+
|
|
72
|
+
# src のハッシュ値や文書コードを生成して、処理するか判定する例
|
|
73
|
+
source = Source.create(SRC)
|
|
74
|
+
document_code = source.get_document_code()
|
|
75
|
+
if document_code not in ['A163', 'A151']:
|
|
76
|
+
raise ValueError(f"Unsupported document code: {document_code}")
|
|
77
|
+
if source.sha256 == '...':
|
|
73
78
|
print("Already processed")
|
|
74
79
|
else:
|
|
75
80
|
parse_archive(
|
|
@@ -81,8 +86,7 @@ else:
|
|
|
81
86
|
image_max_workers=0, # 0: CPU数に応じて自動
|
|
82
87
|
)
|
|
83
88
|
|
|
84
|
-
|
|
85
|
-
print(get_doc_id("output/manifest.json"))
|
|
89
|
+
|
|
86
90
|
```
|
|
87
91
|
- generate_sha256 はアーカイブの内容に応じたハッシュ値を生成し、再処理判定用に使える。
|
|
88
92
|
- parse_archive は SRC,PROCを OUTに展開する。第4引数に、画像変換のパラメータを渡せる。
|
|
@@ -91,8 +95,7 @@ OUT に各種ファイルが展開される。第5引数はOCR処理対象の画
|
|
|
91
95
|
- image_max_workers が 1 のとき: シリアル実行
|
|
92
96
|
- image_max_workers が 2 以上のとき: スレッド並列実行
|
|
93
97
|
- image_max_workers が 0 のとき: CPU数ベースで自動設定
|
|
94
|
-
|
|
95
|
-
- get_doc_id は parse_archive で生成された manifest.json のパスを与えると、doc_id を返す。
|
|
98
|
+
- source = Source.create(SRC) の source は、manifest.json, xml/sources.xml の内容とおなじ。parse_archive するまえに、source.sha256 を得られるということ。
|
|
96
99
|
|
|
97
100
|
### 画像変換の高速化オプション
|
|
98
101
|
既定では Pillow でリサイズします。環境変数 LIBEFILING_RESIZER_BACKEND を指定すると、
|
|
@@ -176,3 +179,11 @@ MIT ライセンス
|
|
|
176
179
|
0.1.60
|
|
177
180
|
- get_document_code 関数は、manifest.jsonだけでなく、アーカイブパス・手続ファイルを与えても文書コードを返すようにした。
|
|
178
181
|
- manifest.json に 文書コードを含めた
|
|
182
|
+
|
|
183
|
+
0.2.0
|
|
184
|
+
- manifest.json の documents フィールドを sources フィールドに変更した。
|
|
185
|
+
- sources の子要素は配列でなく archive, procedure とした。
|
|
186
|
+
- sources.document_code フィールドは、文書コードを表す
|
|
187
|
+
- get_document_code 廃止,Source クラスの get_document_code で代替
|
|
188
|
+
- get_doc_id, generate_sha256 関数廃止, Source クラスの sha256 で代替
|
|
189
|
+
- xml/sources.xml をはき出すようにした. manifest.json の sources フィールドと同じ内容を表す。
|
|
@@ -40,7 +40,7 @@ manifest.json は、次の設計方針に基づいている。
|
|
|
40
40
|
{
|
|
41
41
|
"manifest_version": "1.0.0",
|
|
42
42
|
"generator": { ... },
|
|
43
|
-
"
|
|
43
|
+
"sources": { ... },
|
|
44
44
|
"paths": { ... },
|
|
45
45
|
"xml_files": [ ... ],
|
|
46
46
|
"images": [ ... ],
|
|
@@ -71,13 +71,11 @@ manifest.json は、次の設計方針に基づいている。
|
|
|
71
71
|
- 再現性やデバッグのために使用される
|
|
72
72
|
|
|
73
73
|
|
|
74
|
-
## 4.3
|
|
74
|
+
## 4.3 sources
|
|
75
75
|
```json
|
|
76
|
-
"
|
|
77
|
-
"
|
|
78
|
-
"
|
|
79
|
-
"sources": [
|
|
80
|
-
{
|
|
76
|
+
"sources": {
|
|
77
|
+
"document_code": "A163",
|
|
78
|
+
"archive": {
|
|
81
79
|
"filename": "...AAA.JWX",
|
|
82
80
|
"sha256": "...",
|
|
83
81
|
"byte_size": 12345678,
|
|
@@ -85,7 +83,7 @@ manifest.json は、次の設計方針に基づいている。
|
|
|
85
83
|
"kind": "AA",
|
|
86
84
|
"extension": ".JWX"
|
|
87
85
|
},
|
|
88
|
-
|
|
86
|
+
"procedure": {
|
|
89
87
|
"filename": "...AFM.XML",
|
|
90
88
|
"sha256": "...",
|
|
91
89
|
"byte_size": 4220,
|
|
@@ -93,14 +91,12 @@ manifest.json は、次の設計方針に基づいている。
|
|
|
93
91
|
"kind": "FM",
|
|
94
92
|
"extension": ".XML"
|
|
95
93
|
}
|
|
96
|
-
]
|
|
97
94
|
}
|
|
98
95
|
```
|
|
99
96
|
|
|
100
|
-
-
|
|
101
|
-
-
|
|
102
|
-
-
|
|
103
|
-
- archive_sha256 は再処理判定や追跡用
|
|
97
|
+
- document_code は、文書の分類コード
|
|
98
|
+
- archive, procedure は基になったファイルに関する情報
|
|
99
|
+
- sha256 はarchive, procedure のファイル内容に基づいて生成されたハッシュ値。処理済みかどうかの判定に使える
|
|
104
100
|
- task, kind, extension はファイル名から得られるアーカイブの属する業務、種類、拡張子
|
|
105
101
|
- task の値は以下の通り
|
|
106
102
|
- A: 出願
|
|
@@ -118,7 +114,6 @@ manifest.json は、次の設計方針に基づいている。
|
|
|
118
114
|
- ER: 緊急避難用送信ファイル
|
|
119
115
|
- FM: 手続情報管理ファイル
|
|
120
116
|
- XX: 不明(上記に当てはまらない場合)
|
|
121
|
-
- procedure_source は手続き情報ファイルに関する情報
|
|
122
117
|
|
|
123
118
|
### 4.4 paths
|
|
124
119
|
```json
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "libefiling"
|
|
3
|
-
version = "0.1.60"
|
|
3
|
+
version = "0.2.0"
|
|
4
4
|
description = "A Python library for e-filing systems."
|
|
5
5
|
authors = [{ name = "hyperion13th144m", email = "hyperion13th144m@gmail.com" }]
|
|
6
6
|
requires-python = ">=3.12"
|
|
@@ -41,3 +41,22 @@ include = ["src/libefiling"]
|
|
|
41
41
|
[build-system]
|
|
42
42
|
requires = ["hatchling"]
|
|
43
43
|
build-backend = "hatchling.build"
|
|
44
|
+
|
|
45
|
+
[tool.ruff.lint]
|
|
46
|
+
# 1. Enable flake8-bugbear (`B`) rules, in addition to the defaults.
|
|
47
|
+
select = ["E4", "E7", "E9", "F", "B"]
|
|
48
|
+
|
|
49
|
+
# 2. Avoid enforcing line-length violations (`E501`)
|
|
50
|
+
ignore = ["E501"]
|
|
51
|
+
|
|
52
|
+
# 3. Avoid trying to fix flake8-bugbear (`B`) violations.
|
|
53
|
+
unfixable = ["B"]
|
|
54
|
+
|
|
55
|
+
# 4. Ignore `E402` (import violations) in all `__init__.py` files, and in selected subdirectories.
|
|
56
|
+
[tool.ruff.lint.per-file-ignores]
|
|
57
|
+
"**/{tests,docs,tools}/*" = ["E402"]
|
|
58
|
+
"__init__.py" = ["E402"]
|
|
59
|
+
|
|
60
|
+
[tool.ruff.format]
|
|
61
|
+
# 5. Use double quotes in `ruff format`.
|
|
62
|
+
quote-style = "double"
|
|
@@ -2,19 +2,19 @@ import hashlib
|
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
|
|
4
4
|
|
|
5
|
-
def generate_sha256(
|
|
6
|
-
"""return document sha256 based on
|
|
5
|
+
def generate_sha256(file_path: str | Path) -> str:
|
|
6
|
+
"""return document sha256 based on file_path content
|
|
7
7
|
|
|
8
8
|
Args:
|
|
9
|
-
|
|
9
|
+
file_path (str | Path): file path
|
|
10
10
|
|
|
11
11
|
Returns:
|
|
12
12
|
str: document sha256
|
|
13
13
|
"""
|
|
14
14
|
sha256_hash = hashlib.sha256()
|
|
15
|
-
if isinstance(
|
|
16
|
-
|
|
17
|
-
with open(
|
|
15
|
+
if isinstance(file_path, Path):
|
|
16
|
+
file_path = str(file_path)
|
|
17
|
+
with open(file_path, "rb") as f:
|
|
18
18
|
# Read and update hash string value in blocks of 4K
|
|
19
19
|
for byte_block in iter(lambda: f.read(4096), b""):
|
|
20
20
|
sha256_hash.update(byte_block)
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from datetime import datetime
|
|
4
|
-
from enum import Enum
|
|
5
4
|
from pathlib import Path
|
|
6
|
-
from typing import List,
|
|
5
|
+
from typing import List, Optional
|
|
6
|
+
from xml.etree import ElementTree as ET
|
|
7
7
|
|
|
8
8
|
from pydantic import BaseModel, Field
|
|
9
9
|
|
|
10
|
+
from libefiling.archive.utils import generate_sha256
|
|
10
11
|
from libefiling.image.kind import IMAGE_KIND
|
|
11
12
|
from libefiling.xml.kind import XML_KIND
|
|
12
13
|
|
|
@@ -30,13 +31,14 @@ class Source(BaseModel):
|
|
|
30
31
|
extension: str
|
|
31
32
|
|
|
32
33
|
@classmethod
|
|
33
|
-
def create(cls, file_path: str, sha256: str) -> Source:
|
|
34
|
+
def create(cls, file_path: str | Path) -> Source:
|
|
34
35
|
"""Create Source from file path
|
|
35
36
|
|
|
36
37
|
Args:
|
|
37
|
-
file_path (str): file path
|
|
38
|
+
file_path (str | Path): file path
|
|
38
39
|
"""
|
|
39
40
|
filename = Path(file_path).name
|
|
41
|
+
sha256 = generate_sha256(file_path)
|
|
40
42
|
byte_size = Path(file_path).stat().st_size
|
|
41
43
|
if len(filename) == 63:
|
|
42
44
|
task = filename[56 : 56 + 1]
|
|
@@ -54,11 +56,55 @@ class Source(BaseModel):
|
|
|
54
56
|
extension=extension,
|
|
55
57
|
)
|
|
56
58
|
|
|
59
|
+
def get_document_code(self) -> str:
|
|
60
|
+
"""Get document code from archive file name
|
|
61
|
+
|
|
62
|
+
Args:
|
|
63
|
+
Returns:
|
|
64
|
+
str: document code (e.g. A163), or "UNKNOWN" if the file name is too short
|
|
65
|
+
"""
|
|
66
|
+
if len(self.filename) < 29:
|
|
67
|
+
return "UNKNOWN"
|
|
68
|
+
else:
|
|
69
|
+
return self.filename[19 : 19 + 9].replace("_", "").strip()
|
|
57
70
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
71
|
+
|
|
72
|
+
class Sources(BaseModel):
|
|
73
|
+
document_code: str
|
|
74
|
+
archive: Source
|
|
75
|
+
procedure: Source
|
|
76
|
+
|
|
77
|
+
def save_as_xml(self, xml_path: str) -> None:
|
|
78
|
+
"""Save Sources as XML file
|
|
79
|
+
|
|
80
|
+
Args:
|
|
81
|
+
xml_path (str): XML file path to save
|
|
82
|
+
"""
|
|
83
|
+
root = ET.Element("sources", attrib={"document-code": self.document_code})
|
|
84
|
+
for source in [self.archive, self.procedure]:
|
|
85
|
+
ET.SubElement(
|
|
86
|
+
root,
|
|
87
|
+
"source",
|
|
88
|
+
attrib={
|
|
89
|
+
"filename": source.filename,
|
|
90
|
+
"sha256": source.sha256,
|
|
91
|
+
"byte-size": str(source.byte_size),
|
|
92
|
+
"task": source.task,
|
|
93
|
+
"kind": source.kind,
|
|
94
|
+
"extension": source.extension,
|
|
95
|
+
},
|
|
96
|
+
)
|
|
97
|
+
tree = ET.ElementTree(root)
|
|
98
|
+
tree.write(xml_path, encoding="utf-8", xml_declaration=True)
|
|
99
|
+
|
|
100
|
+
def to_xml_file(self, xml_path: str) -> XmlFile:
|
|
101
|
+
return XmlFile(
|
|
102
|
+
filename=Path(xml_path).name,
|
|
103
|
+
original_filename=None,
|
|
104
|
+
sha256=generate_sha256(xml_path),
|
|
105
|
+
encoding=EncodingInfo(detected="UTF-8", normalized_to="UTF-8"),
|
|
106
|
+
kind="source",
|
|
107
|
+
)
|
|
62
108
|
|
|
63
109
|
|
|
64
110
|
# -------------------------
|
|
@@ -152,23 +198,8 @@ class Stats(BaseModel):
|
|
|
152
198
|
class Manifest(BaseModel):
|
|
153
199
|
manifest_version: str = "1.0.0"
|
|
154
200
|
generator: GeneratorInfo
|
|
155
|
-
|
|
201
|
+
sources: Sources
|
|
156
202
|
paths: Paths = Paths()
|
|
157
203
|
xml_files: List[XmlFile] = []
|
|
158
204
|
images: List[ImageEntry] = []
|
|
159
205
|
stats: Stats
|
|
160
|
-
images: List[ImageEntry] = []
|
|
161
|
-
stats: Stats
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
def get_doc_id(manifest_path: str) -> str | None:
|
|
165
|
-
"""Get document ID from manifest file
|
|
166
|
-
|
|
167
|
-
Args:
|
|
168
|
-
manifest_path (str): manifest file path (e.g. manifest.json)
|
|
169
|
-
Returns:
|
|
170
|
-
str: document ID (e.g. 2024000000000)
|
|
171
|
-
"""
|
|
172
|
-
mp = Path(manifest_path)
|
|
173
|
-
manifest = Manifest.model_validate_json(mp.read_text(encoding="utf-8"))
|
|
174
|
-
return manifest.document.doc_id.strip() if manifest.document.doc_id else None
|
|
@@ -5,14 +5,12 @@ from importlib.metadata import version as get_version
|
|
|
5
5
|
from itertools import chain
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
from typing import Iterable, Iterator, List
|
|
8
|
-
from xml.etree import ElementTree as ET
|
|
9
8
|
|
|
10
9
|
from libefiling.archive.utils import generate_sha256
|
|
11
10
|
from libefiling.image.kind import OCR_TARGET, detect_image_kind
|
|
12
11
|
from libefiling.image.mediatype import get_media_type
|
|
13
12
|
from libefiling.manifest import (
|
|
14
13
|
DerivedImage,
|
|
15
|
-
DocumentInfo,
|
|
16
14
|
EncodingInfo,
|
|
17
15
|
GeneratorInfo,
|
|
18
16
|
ImageAttributes,
|
|
@@ -20,6 +18,7 @@ from libefiling.manifest import (
|
|
|
20
18
|
Manifest,
|
|
21
19
|
OcrInfo,
|
|
22
20
|
Source,
|
|
21
|
+
Sources,
|
|
23
22
|
Stats,
|
|
24
23
|
XmlFile,
|
|
25
24
|
)
|
|
@@ -69,7 +68,8 @@ def parse_archive(
|
|
|
69
68
|
xml_files = process_xml(raw_xml_files, xml_dir)
|
|
70
69
|
|
|
71
70
|
### convert charset of procedure xml to UTF-8 and save to xml_dir
|
|
72
|
-
|
|
71
|
+
proc_xml_path = xml_dir / "procedure.xml"
|
|
72
|
+
xml_files.append(process_procedure_xml(Path(src_procedure_path), proc_xml_path))
|
|
73
73
|
|
|
74
74
|
### guess language
|
|
75
75
|
lang = guess_language_by_filename(str(xml_dir))
|
|
@@ -89,16 +89,24 @@ def parse_archive(
|
|
|
89
89
|
max_workers=image_max_workers,
|
|
90
90
|
)
|
|
91
91
|
|
|
92
|
-
|
|
92
|
+
### generate sources.xml
|
|
93
|
+
source_archive = Source.create(src_archive_path)
|
|
94
|
+
source_proc = Source.create(src_procedure_path)
|
|
95
|
+
sources = Sources(
|
|
96
|
+
document_code=source_archive.get_document_code(),
|
|
97
|
+
archive=source_archive,
|
|
98
|
+
procedure=source_proc,
|
|
99
|
+
)
|
|
100
|
+
sources_xml_path = str(xml_dir / "sources.xml")
|
|
101
|
+
sources.save_as_xml(sources_xml_path)
|
|
102
|
+
xml_files.append(sources.to_xml_file(sources_xml_path))
|
|
93
103
|
|
|
94
104
|
# generate manifest
|
|
95
105
|
manifest = process_manifest(
|
|
96
|
-
|
|
97
|
-
src_procedure_path,
|
|
106
|
+
sources,
|
|
98
107
|
str(xml_dir),
|
|
99
108
|
xml_files,
|
|
100
109
|
images,
|
|
101
|
-
code=code if code else "UNKNOWN",
|
|
102
110
|
)
|
|
103
111
|
|
|
104
112
|
manifest_path = output_root / "manifest.json"
|
|
@@ -151,16 +159,14 @@ def process_xml(
|
|
|
151
159
|
|
|
152
160
|
def process_procedure_xml(
|
|
153
161
|
src_procedure_path: Path,
|
|
154
|
-
|
|
155
|
-
filename: str = "procedure.xml",
|
|
162
|
+
xml_path: Path,
|
|
156
163
|
) -> XmlFile:
|
|
157
|
-
xml_path = xml_dir / filename
|
|
158
164
|
convert_xml_charset(str(src_procedure_path), str(xml_path))
|
|
159
165
|
return XmlFile(
|
|
160
|
-
filename=filename,
|
|
166
|
+
filename=xml_path.name,
|
|
161
167
|
encoding=EncodingInfo(detected="shift_jis", normalized_to="UTF-8"),
|
|
162
168
|
sha256=generate_sha256(xml_path),
|
|
163
|
-
kind=detect_xml_kind(filename),
|
|
169
|
+
kind=detect_xml_kind(xml_path.name),
|
|
164
170
|
)
|
|
165
171
|
|
|
166
172
|
|
|
@@ -180,7 +186,9 @@ def process_images(
|
|
|
180
186
|
workers = _resolve_worker_count(max_workers)
|
|
181
187
|
if workers <= 1 or len(image_list) == 1:
|
|
182
188
|
return [
|
|
183
|
-
_process_single_image(
|
|
189
|
+
_process_single_image(
|
|
190
|
+
image, images_dir, ocr_dir, image_params, lang, ocr_target
|
|
191
|
+
)
|
|
184
192
|
for image in image_list
|
|
185
193
|
]
|
|
186
194
|
|
|
@@ -278,12 +286,10 @@ def get_ocr_text(image: Path, ocr_dir: Path, lang: str) -> OcrInfo:
|
|
|
278
286
|
|
|
279
287
|
|
|
280
288
|
def process_manifest(
|
|
281
|
-
|
|
282
|
-
src_procedure_path: str,
|
|
289
|
+
sources: Sources,
|
|
283
290
|
xml_dir: str,
|
|
284
291
|
xml_files: list[XmlFile],
|
|
285
292
|
images: list[ImageEntry],
|
|
286
|
-
code: str,
|
|
287
293
|
) -> Manifest:
|
|
288
294
|
manifest = Manifest(
|
|
289
295
|
generator=GeneratorInfo(
|
|
@@ -291,18 +297,7 @@ def process_manifest(
|
|
|
291
297
|
version=get_version("libefiling"),
|
|
292
298
|
created_at=datetime.now(),
|
|
293
299
|
),
|
|
294
|
-
|
|
295
|
-
doc_id=generate_sha256(src_archive_path),
|
|
296
|
-
code=code,
|
|
297
|
-
sources=[
|
|
298
|
-
Source.create(
|
|
299
|
-
src_archive_path, sha256=generate_sha256(src_archive_path)
|
|
300
|
-
),
|
|
301
|
-
Source.create(
|
|
302
|
-
src_procedure_path, sha256=generate_sha256(src_procedure_path)
|
|
303
|
-
),
|
|
304
|
-
],
|
|
305
|
-
),
|
|
300
|
+
sources=sources,
|
|
306
301
|
xml_files=xml_files,
|
|
307
302
|
images=images,
|
|
308
303
|
stats=Stats(
|
|
@@ -314,22 +309,3 @@ def process_manifest(
|
|
|
314
309
|
)
|
|
315
310
|
|
|
316
311
|
return manifest
|
|
317
|
-
|
|
318
|
-
def get_document_code_from_procedure(procedure_path: str) -> str | None:
|
|
319
|
-
"""Get document code from procedure.xml file path
|
|
320
|
-
|
|
321
|
-
Args:
|
|
322
|
-
procedure_path (str): procedure.xml file path
|
|
323
|
-
Returns:
|
|
324
|
-
str: document code (e.g. A163) or None if not found
|
|
325
|
-
"""
|
|
326
|
-
ns = {"jp": "http://www.jpo.go.jp"}
|
|
327
|
-
tree = ET.parse(procedure_path)
|
|
328
|
-
elem = tree.find(".//jp:document-name", ns)
|
|
329
|
-
if elem is None:
|
|
330
|
-
return None
|
|
331
|
-
|
|
332
|
-
# Namespaced attributes are stored as expanded QName keys.
|
|
333
|
-
code = elem.get("{http://www.jpo.go.jp}document-code")
|
|
334
|
-
return code.strip() if code else None
|
|
335
|
-
|
|
@@ -23,6 +23,7 @@ XML_KIND = Literal[
|
|
|
23
23
|
"special-attached-documents",
|
|
24
24
|
"special-st26-sequence-list",
|
|
25
25
|
"procedure",
|
|
26
|
+
"source",
|
|
26
27
|
"unknown",
|
|
27
28
|
]
|
|
28
29
|
|
|
@@ -139,6 +140,11 @@ re_xml: list[XML_RE_MAP] = [
|
|
|
139
140
|
"regex": re.compile(r"procedure\.xml"),
|
|
140
141
|
"description": "procedure XML procedure.xml",
|
|
141
142
|
},
|
|
143
|
+
{
|
|
144
|
+
"kind": "source",
|
|
145
|
+
"regex": re.compile(r"source\.xml"),
|
|
146
|
+
"description": "source XML source.xml",
|
|
147
|
+
},
|
|
142
148
|
]
|
|
143
149
|
|
|
144
150
|
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
from time import perf_counter
|
|
3
|
-
|
|
4
|
-
from libefiling.default_config import defaultImageParams
|
|
5
|
-
from libefiling.parse import process_images
|
|
6
|
-
|
|
7
|
-
# Benchmark target: images/ 以下の tif 画像
|
|
8
|
-
image_files = sorted(Path("images").rglob("*.tif"))
|
|
9
|
-
if not image_files:
|
|
10
|
-
raise SystemExit("No *.tif files found under images/")
|
|
11
|
-
|
|
12
|
-
sample_size = min(120, len(image_files))
|
|
13
|
-
sample = image_files[:sample_size]
|
|
14
|
-
print(f"sample: {sample_size} files")
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def run_case(max_workers: int | None, out_root: Path) -> float:
|
|
18
|
-
out_images = out_root / "images"
|
|
19
|
-
out_ocr = out_root / "ocr"
|
|
20
|
-
out_images.mkdir(parents=True, exist_ok=True)
|
|
21
|
-
out_ocr.mkdir(parents=True, exist_ok=True)
|
|
22
|
-
|
|
23
|
-
start = perf_counter()
|
|
24
|
-
result = process_images(
|
|
25
|
-
sample,
|
|
26
|
-
out_images,
|
|
27
|
-
out_ocr,
|
|
28
|
-
defaultImageParams,
|
|
29
|
-
"jpn",
|
|
30
|
-
None,
|
|
31
|
-
max_workers=max_workers,
|
|
32
|
-
)
|
|
33
|
-
elapsed = perf_counter() - start
|
|
34
|
-
print(
|
|
35
|
-
f"max_workers={max_workers} elapsed={elapsed:.3f}s items={len(result)}"
|
|
36
|
-
)
|
|
37
|
-
return elapsed
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
serial_sec = run_case(1, Path("/tmp/libefiling-bench-serial"))
|
|
41
|
-
parallel_sec = run_case(0, Path("/tmp/libefiling-bench-auto"))
|
|
42
|
-
print(f"speedup: {serial_sec / parallel_sec:.3f}x")
|
|
43
|
-
print(f"time_reduction: {(serial_sec - parallel_sec) / serial_sec * 100:.2f}%")
|
|
@@ -1,137 +0,0 @@
|
|
|
1
|
-
"""Benchmark the current PIL implementation in the active environment."""
|
|
2
|
-
|
|
3
|
-
import argparse
|
|
4
|
-
import importlib.metadata
|
|
5
|
-
import json
|
|
6
|
-
import statistics
|
|
7
|
-
from pathlib import Path
|
|
8
|
-
from time import perf_counter
|
|
9
|
-
|
|
10
|
-
import libefiling.image.convert as conv
|
|
11
|
-
from libefiling.default_config import defaultImageParams
|
|
12
|
-
from libefiling.image.convert import get_size, load_image, resize_image
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def parse_args() -> argparse.Namespace:
|
|
16
|
-
parser = argparse.ArgumentParser()
|
|
17
|
-
parser.add_argument("--sample-size", type=int, default=120)
|
|
18
|
-
parser.add_argument("--repeats", type=int, default=3)
|
|
19
|
-
parser.add_argument("--backend", default="pillow")
|
|
20
|
-
parser.add_argument("--json", action="store_true")
|
|
21
|
-
return parser.parse_args()
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
args = parse_args()
|
|
25
|
-
|
|
26
|
-
# --------------------------------------------------------------------------- #
|
|
27
|
-
# 画像ファイルの収集
|
|
28
|
-
# --------------------------------------------------------------------------- #
|
|
29
|
-
DATA_ROOT = Path("images/var/data")
|
|
30
|
-
image_files = sorted(DATA_ROOT.rglob("*.tif"))
|
|
31
|
-
if not image_files:
|
|
32
|
-
raise SystemExit(f"No *.tif files found under {DATA_ROOT}")
|
|
33
|
-
|
|
34
|
-
MAX_SAMPLES = 120
|
|
35
|
-
sample_files = image_files[: min(args.sample_size, len(image_files))]
|
|
36
|
-
print(f"Total *.tif: {len(image_files)} → using {len(sample_files)} files")
|
|
37
|
-
|
|
38
|
-
# --------------------------------------------------------------------------- #
|
|
39
|
-
# pillow-simd の有無を確認
|
|
40
|
-
# --------------------------------------------------------------------------- #
|
|
41
|
-
try:
|
|
42
|
-
simd_ver = importlib.metadata.version("pillow-simd")
|
|
43
|
-
print(f"pillow-simd : {simd_ver}")
|
|
44
|
-
except importlib.metadata.PackageNotFoundError:
|
|
45
|
-
simd_ver = None
|
|
46
|
-
print("pillow-simd : not installed (pillow-simd backend は pillow にフォールバックします)")
|
|
47
|
-
|
|
48
|
-
try:
|
|
49
|
-
pillow_ver = importlib.metadata.version("Pillow")
|
|
50
|
-
except importlib.metadata.PackageNotFoundError:
|
|
51
|
-
pillow_ver = "unknown"
|
|
52
|
-
print(f"Pillow : {pillow_ver}")
|
|
53
|
-
|
|
54
|
-
if simd_ver is not None:
|
|
55
|
-
print("NOTE: pillow-simd をインストールした環境では PIL 自体が pillow-simd 実装です。")
|
|
56
|
-
print(" このスクリプト内の 'pillow' と 'pillow-simd' は別バイナリ比較ではなく、")
|
|
57
|
-
print(" 同じ PIL 実装に対する別コードパス比較になります。")
|
|
58
|
-
else:
|
|
59
|
-
print("NOTE: pillow-simd 未導入環境では 'pillow-simd' backend は Pillow にフォールバックします。")
|
|
60
|
-
print(" このスクリプト内の 'pillow' と 'pillow-simd' は別バイナリ比較にはなりません。")
|
|
61
|
-
|
|
62
|
-
print(" 真の比較を行うには、Pillow 環境と pillow-simd 環境を分けて個別に実行してください。")
|
|
63
|
-
print()
|
|
64
|
-
|
|
65
|
-
# --------------------------------------------------------------------------- #
|
|
66
|
-
# 画像を事前にメモリへロード(I/O をベンチから除外)
|
|
67
|
-
# --------------------------------------------------------------------------- #
|
|
68
|
-
print("Loading images into memory...", end=" ", flush=True)
|
|
69
|
-
loaded_images = []
|
|
70
|
-
for p in sample_files:
|
|
71
|
-
try:
|
|
72
|
-
loaded_images.append(load_image(p))
|
|
73
|
-
except Exception as e:
|
|
74
|
-
print(f"\nSkipping {p}: {e}")
|
|
75
|
-
print(f"{len(loaded_images)} images loaded.")
|
|
76
|
-
print()
|
|
77
|
-
|
|
78
|
-
# リサイズターゲット(defaultImageParams の全サイズを使用)
|
|
79
|
-
resize_targets = [(p.width, p.height) for p in defaultImageParams]
|
|
80
|
-
|
|
81
|
-
# --------------------------------------------------------------------------- #
|
|
82
|
-
# ベンチマーク本体
|
|
83
|
-
# --------------------------------------------------------------------------- #
|
|
84
|
-
REPEATS = args.repeats
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def run_benchmark(backend: str) -> list[float]:
|
|
88
|
-
"""指定バックエンドで全サンプルをリサイズし、1回あたりの経過秒のリストを返す。"""
|
|
89
|
-
conv.RESIZER_BACKEND = backend
|
|
90
|
-
times: list[float] = []
|
|
91
|
-
for _ in range(REPEATS):
|
|
92
|
-
t0 = perf_counter()
|
|
93
|
-
for img in loaded_images:
|
|
94
|
-
for w, h in resize_targets:
|
|
95
|
-
size = get_size(img, w, h)
|
|
96
|
-
resize_image(img, size)
|
|
97
|
-
elapsed = perf_counter() - t0
|
|
98
|
-
times.append(elapsed)
|
|
99
|
-
return times
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
print(f"[{args.backend}] running ({REPEATS} reps) ...", flush=True)
|
|
103
|
-
times = run_benchmark(args.backend)
|
|
104
|
-
best = min(times)
|
|
105
|
-
avg = statistics.mean(times)
|
|
106
|
-
ops = len(loaded_images) * len(resize_targets)
|
|
107
|
-
throughput = ops / best
|
|
108
|
-
|
|
109
|
-
print(f" best={best:.3f}s avg={avg:.3f}s ({ops} resize ops/rep)")
|
|
110
|
-
print()
|
|
111
|
-
print("=" * 50)
|
|
112
|
-
print(" Summary")
|
|
113
|
-
print("=" * 50)
|
|
114
|
-
print(f" backend {args.backend}")
|
|
115
|
-
print(f" pillow {pillow_ver}")
|
|
116
|
-
print(f" pillow-simd {simd_ver or 'not installed'}")
|
|
117
|
-
print(f" best {best:.3f}s")
|
|
118
|
-
print(f" avg {avg:.3f}s")
|
|
119
|
-
print(f" throughput {throughput:.0f} ops/s")
|
|
120
|
-
|
|
121
|
-
if args.json:
|
|
122
|
-
print(
|
|
123
|
-
json.dumps(
|
|
124
|
-
{
|
|
125
|
-
"backend": args.backend,
|
|
126
|
-
"pillow": pillow_ver,
|
|
127
|
-
"pillow_simd": simd_ver,
|
|
128
|
-
"sample_size": len(sample_files),
|
|
129
|
-
"repeats": REPEATS,
|
|
130
|
-
"ops_per_repeat": ops,
|
|
131
|
-
"times": times,
|
|
132
|
-
"best": best,
|
|
133
|
-
"avg": avg,
|
|
134
|
-
"throughput": throughput,
|
|
135
|
-
}
|
|
136
|
-
)
|
|
137
|
-
)
|
|
@@ -1,145 +0,0 @@
|
|
|
1
|
-
"""Run an isolated Pillow vs pillow-simd vs cykooz benchmark in separate virtualenvs."""
|
|
2
|
-
|
|
3
|
-
import argparse
|
|
4
|
-
import json
|
|
5
|
-
import shutil
|
|
6
|
-
import subprocess
|
|
7
|
-
import sys
|
|
8
|
-
from pathlib import Path
|
|
9
|
-
|
|
10
|
-
ROOT = Path(__file__).resolve().parents[1]
|
|
11
|
-
PYTHON = sys.executable
|
|
12
|
-
ENV_ROOT = ROOT / ".bench-envs"
|
|
13
|
-
|
|
14
|
-
COMMON_DEPS = [
|
|
15
|
-
"asn1crypto>=1.5.1,<2.0.0",
|
|
16
|
-
"pytesseract>=0.3.13,<0.4.0",
|
|
17
|
-
"pydantic>=2.12.5,<3.0.0",
|
|
18
|
-
]
|
|
19
|
-
|
|
20
|
-
ENV_SPECS = [
|
|
21
|
-
{
|
|
22
|
-
"name": "pillow",
|
|
23
|
-
"packages": ["pillow"],
|
|
24
|
-
"backend": "pillow",
|
|
25
|
-
},
|
|
26
|
-
{
|
|
27
|
-
"name": "pillow-simd",
|
|
28
|
-
"packages": ["pillow-simd"],
|
|
29
|
-
"backend": "pillow",
|
|
30
|
-
},
|
|
31
|
-
{
|
|
32
|
-
"name": "cykooz",
|
|
33
|
-
"packages": ["pillow", "cykooz_resizer"],
|
|
34
|
-
"backend": "cykooz",
|
|
35
|
-
},
|
|
36
|
-
]
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
def parse_args() -> argparse.Namespace:
|
|
40
|
-
parser = argparse.ArgumentParser()
|
|
41
|
-
parser.add_argument("--sample-size", type=int, default=120)
|
|
42
|
-
parser.add_argument("--repeats", type=int, default=3)
|
|
43
|
-
parser.add_argument("--keep-envs", action="store_true")
|
|
44
|
-
return parser.parse_args()
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
def run(cmd: list[str], cwd: Path | None = None) -> subprocess.CompletedProcess[str]:
|
|
48
|
-
return subprocess.run(
|
|
49
|
-
cmd,
|
|
50
|
-
cwd=cwd or ROOT,
|
|
51
|
-
check=True,
|
|
52
|
-
text=True,
|
|
53
|
-
capture_output=True,
|
|
54
|
-
)
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
def env_python(env_dir: Path) -> Path:
|
|
58
|
-
return env_dir / "bin" / "python"
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
def setup_env(env_name: str, packages: list[str]) -> Path:
|
|
62
|
-
env_dir = ENV_ROOT / env_name
|
|
63
|
-
if env_dir.exists():
|
|
64
|
-
shutil.rmtree(env_dir)
|
|
65
|
-
|
|
66
|
-
print(f"[setup] {env_name}")
|
|
67
|
-
run(["uv", "venv", str(env_dir), "--python", PYTHON])
|
|
68
|
-
python_bin = env_python(env_dir)
|
|
69
|
-
|
|
70
|
-
run(["uv", "pip", "install", "--python", str(python_bin), "-e", ".", "--no-deps"])
|
|
71
|
-
run(["uv", "pip", "install", "--python", str(python_bin), *COMMON_DEPS, *packages])
|
|
72
|
-
return env_dir
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
def benchmark_env(
    env_name: str,
    env_dir: Path,
    backend: str,
    sample_size: int,
    repeats: int,
) -> dict:
    """Run ``docs/benchmark_resize.py`` inside one environment and parse its result.

    The script's stdout is echoed, and its last line is decoded as JSON.

    Args:
        env_name: Label stored in the result under ``"env_name"``.
        env_dir: Environment directory whose interpreter runs the script.
        backend: Value passed as ``--backend``.
        sample_size: Value passed as ``--sample-size``.
        repeats: Value passed as ``--repeats``.

    Returns:
        The decoded JSON payload with ``"env_name"`` added.
    """
    command = [
        str(env_python(env_dir)),
        "docs/benchmark_resize.py",
        "--backend",
        backend,
        "--sample-size",
        str(sample_size),
        "--repeats",
        str(repeats),
        "--json",
    ]
    output = run(command).stdout
    print(output)

    # Only the final stdout line carries the JSON report.
    last_line = output.strip().splitlines()[-1]
    report = json.loads(last_line)
    report["env_name"] = env_name
    return report
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
def main() -> int:
    """Set up every environment, benchmark each one, and print a comparison.

    All environments in ``ENV_SPECS`` are created first, then each is
    benchmarked in the same order. Unless ``--keep-envs`` was given, the
    ``ENV_ROOT`` tree is removed afterwards, including when environment setup
    or a benchmark run fails.

    Returns:
        0 on success; failures propagate as exceptions.
    """
    args = parse_args()
    ENV_ROOT.mkdir(exist_ok=True)

    env_dirs: dict[str, Path] = {}
    results: list[dict] = []

    # Setup is inside the try block so that a failing install does not leave
    # half-built environments behind.
    try:
        for spec in ENV_SPECS:
            env_dirs[spec["name"]] = setup_env(spec["name"], spec["packages"])

        for spec in ENV_SPECS:
            results.append(
                benchmark_env(
                    spec["name"],
                    env_dirs[spec["name"]],
                    spec["backend"],
                    args.sample_size,
                    args.repeats,
                )
            )
    finally:
        if not args.keep_envs:
            shutil.rmtree(ENV_ROOT, ignore_errors=True)

    by_name = {result["env_name"]: result for result in results}
    # Speedups are reported relative to the plain "pillow" environment.
    pillow_result = by_name["pillow"]
    fastest = min(results, key=lambda result: result["best"])

    print("=" * 60)
    print("Isolated Comparison")
    print("=" * 60)
    for result in results:
        speedup = pillow_result["best"] / result["best"]
        print(
            f"{result['env_name']:<12} best={result['best']:.3f}s "
            f"throughput={result['throughput']:.0f} ops/s speedup={speedup:.3f}x"
        )
    print(f"fastest      {fastest['env_name']} (best={fastest['best']:.3f}s)")
    return 0
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
if __name__ == "__main__":
    # Use main()'s integer result as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
from libefiling.manifest import Manifest
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def get_document_code(file_path: str) -> str | None:
    """Get the document code from a manifest, archive or procedure file.

    Paths ending in ``manifest.json`` are read as a manifest; any other path
    is decoded from the file name itself.

    Args:
        file_path (str): manifest, archive or procedure file path
    Returns:
        str: document code (e.g. A163) or None if not found
    """
    if not file_path.endswith("manifest.json"):
        return get_document_code_from_filename(file_path)
    return get_document_code_from_manifest(file_path)
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def get_document_code_from_manifest(manifest_path: str) -> str | None:
    """Get the document code from a manifest file.

    Args:
        manifest_path (str): manifest file path
    Returns:
        str: document code (e.g. A163) or None if not found
    Raises:
        OSError: if the manifest file cannot be opened.
        pydantic.ValidationError: if the file content is not a valid manifest.
    """
    with open(manifest_path, "r", encoding="utf-8") as f:
        # The file holds JSON text, so it must go through the JSON validator;
        # model_validate() expects an already-parsed object and rejects a str.
        manifest = Manifest.model_validate_json(f.read())
    # Normalize an empty code to None.
    return manifest.document.code or None
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def get_document_code_from_filename(file_path: str) -> str | None:
|
|
33
|
-
"""Get document code from archive file name
|
|
34
|
-
|
|
35
|
-
Args:
|
|
36
|
-
file_path (str): archive file path
|
|
37
|
-
Returns:
|
|
38
|
-
str: document code (e.g. A163) or None if not found
|
|
39
|
-
"""
|
|
40
|
-
if len(file_path) < 29:
|
|
41
|
-
return None
|
|
42
|
-
else:
|
|
43
|
-
return file_path[20:20 + 9].replace("_", "").strip()
|
|
44
|
-
|
|
45
|
-
if __name__ == "__main__":
    import sys

    # The first command-line argument is the file to inspect.
    target_path = sys.argv[1]
    print(get_document_code(target_path))
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|