clichefactory 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clichefactory/__about__.py +1 -0
- clichefactory/__init__.py +42 -0
- clichefactory/_config.py +164 -0
- clichefactory/_engine/__init__.py +6 -0
- clichefactory/_engine/adapters/__init__.py +0 -0
- clichefactory/_engine/adapters/csv_adapter.py +139 -0
- clichefactory/_engine/adapters/docling_adapter.py +280 -0
- clichefactory/_engine/adapters/eml_adapter.py +45 -0
- clichefactory/_engine/adapters/vlm_adapter.py +72 -0
- clichefactory/_engine/adapters/xlsx_adapter.py +142 -0
- clichefactory/_engine/ai_clients/__init__.py +20 -0
- clichefactory/_engine/ai_clients/anthropic_client.py +308 -0
- clichefactory/_engine/ai_clients/factory.py +104 -0
- clichefactory/_engine/ai_clients/gemini_client.py +424 -0
- clichefactory/_engine/ai_clients/json_utils.py +150 -0
- clichefactory/_engine/ai_clients/ollama_client.py +170 -0
- clichefactory/_engine/ai_clients/openai_client.py +257 -0
- clichefactory/_engine/ai_clients/prompts.py +21 -0
- clichefactory/_engine/ai_clients/protocol.py +71 -0
- clichefactory/_engine/ai_clients/tests/__init__.py +0 -0
- clichefactory/_engine/ai_clients/tests/test_gemini_usage_phases.py +32 -0
- clichefactory/_engine/ai_clients/usage_tracker.py +23 -0
- clichefactory/_engine/cache/__init__.py +10 -0
- clichefactory/_engine/cache/base_cacher.py +29 -0
- clichefactory/_engine/cache/file_system_cacher.py +49 -0
- clichefactory/_engine/config/__init__.py +5 -0
- clichefactory/_engine/config/base_config.py +58 -0
- clichefactory/_engine/contracts/document_metadata.py +24 -0
- clichefactory/_engine/contracts/fingerprinting.py +22 -0
- clichefactory/_engine/contracts/key_builder.py +47 -0
- clichefactory/_engine/contracts/model_schema.py +248 -0
- clichefactory/_engine/contracts/operations.py +28 -0
- clichefactory/_engine/contracts/payloads/deployment.py +24 -0
- clichefactory/_engine/contracts/payloads/inference.py +53 -0
- clichefactory/_engine/contracts/payloads/training.py +138 -0
- clichefactory/_engine/contracts/validators.py +45 -0
- clichefactory/_engine/extractors/__init__.py +12 -0
- clichefactory/_engine/metrics/__init__.py +5 -0
- clichefactory/_engine/metrics/metrics_config.py +39 -0
- clichefactory/_engine/metrics/metrics_config_example.yaml +22 -0
- clichefactory/_engine/models/document_model.py +52 -0
- clichefactory/_engine/models/normalized_doc.py +60 -0
- clichefactory/_engine/models/usage_summary.py +11 -0
- clichefactory/_engine/parsers/csv_parser.py +204 -0
- clichefactory/_engine/parsers/doc_parser.py +28 -0
- clichefactory/_engine/parsers/docling_pipeline_options.py +7 -0
- clichefactory/_engine/parsers/docx_parser.py +37 -0
- clichefactory/_engine/parsers/eml_parser.py +183 -0
- clichefactory/_engine/parsers/fallback_media_parser.py +100 -0
- clichefactory/_engine/parsers/image_parser.py +76 -0
- clichefactory/_engine/parsers/media_parser.py +88 -0
- clichefactory/_engine/parsers/media_parser_registry.py +79 -0
- clichefactory/_engine/parsers/parser_utils/eml_utils.py +181 -0
- clichefactory/_engine/parsers/parser_utils/image/__init__.py +6 -0
- clichefactory/_engine/parsers/parser_utils/image/image_pipeline.py +158 -0
- clichefactory/_engine/parsers/parser_utils/image/image_pipeline_options.py +43 -0
- clichefactory/_engine/parsers/parser_utils/image/parsers/__init__.py +13 -0
- clichefactory/_engine/parsers/parser_utils/image/parsers/docling.py +47 -0
- clichefactory/_engine/parsers/parser_utils/image/parsers/ocr_llm.py +74 -0
- clichefactory/_engine/parsers/parser_utils/image/parsers/pytesseract.py +47 -0
- clichefactory/_engine/parsers/parser_utils/image/parsers/rapidocr.py +47 -0
- clichefactory/_engine/parsers/parser_utils/lang_mapping.py +127 -0
- clichefactory/_engine/parsers/parser_utils/layout/__init__.py +4 -0
- clichefactory/_engine/parsers/parser_utils/media_router.py +30 -0
- clichefactory/_engine/parsers/parser_utils/media_type_detector.py +197 -0
- clichefactory/_engine/parsers/parser_utils/office_converter.py +125 -0
- clichefactory/_engine/parsers/parser_utils/pdf/__init__.py +5 -0
- clichefactory/_engine/parsers/parser_utils/pdf/classifier.py +58 -0
- clichefactory/_engine/parsers/parser_utils/pdf/docling_helpers.py +448 -0
- clichefactory/_engine/parsers/parser_utils/pdf/docling_pipeline_options.py +105 -0
- clichefactory/_engine/parsers/parser_utils/pdf/strategies/__init__.py +13 -0
- clichefactory/_engine/parsers/parser_utils/pdf/strategies/docling_baseline.py +79 -0
- clichefactory/_engine/parsers/parser_utils/pdf/strategies/docling_vlm.py +160 -0
- clichefactory/_engine/parsers/parser_utils/pdf/strategies/ocr_llm.py +97 -0
- clichefactory/_engine/parsers/parser_utils/pdf/strategies/pymupdf_structured.py +118 -0
- clichefactory/_engine/parsers/parser_utils/pdf_repair.py +102 -0
- clichefactory/_engine/parsers/parser_utils/prompts.py +97 -0
- clichefactory/_engine/parsers/pdf_parser.py +130 -0
- clichefactory/_engine/parsers/text_parser.py +51 -0
- clichefactory/_engine/parsers/xlsx_parser.py +310 -0
- clichefactory/_extract_finalize.py +41 -0
- clichefactory/_extract_validation.py +49 -0
- clichefactory/_local.py +488 -0
- clichefactory/_schema.py +248 -0
- clichefactory/_service.py +312 -0
- clichefactory/_service_url.py +23 -0
- clichefactory/_upload.py +244 -0
- clichefactory/_utils.py +146 -0
- clichefactory/cli.py +652 -0
- clichefactory/cliche.py +305 -0
- clichefactory/client.py +364 -0
- clichefactory/errors.py +61 -0
- clichefactory/types.py +131 -0
- clichefactory-0.1.0.dist-info/METADATA +473 -0
- clichefactory-0.1.0.dist-info/RECORD +98 -0
- clichefactory-0.1.0.dist-info/WHEEL +4 -0
- clichefactory-0.1.0.dist-info/entry_points.txt +2 -0
- clichefactory-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from clichefactory.__about__ import __version__
|
|
4
|
+
from clichefactory.client import Client, factory
|
|
5
|
+
from clichefactory.cliche import Cliche
|
|
6
|
+
from clichefactory.types import Endpoint, PartialExtraction, ParsingOptions, PostprocessFn
|
|
7
|
+
from clichefactory.errors import (
|
|
8
|
+
AuthenticationError,
|
|
9
|
+
ClicheFactoryError,
|
|
10
|
+
ConfigurationError,
|
|
11
|
+
ExtractionError,
|
|
12
|
+
ParsingError,
|
|
13
|
+
ServiceUnavailableError,
|
|
14
|
+
TrainingError,
|
|
15
|
+
UnsupportedModeError,
|
|
16
|
+
UnsupportedParserError,
|
|
17
|
+
UploadError,
|
|
18
|
+
ValidationError,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
# Names re-exported as the package's public API (order kept as published).
__all__ = [
    "__version__",
    "AuthenticationError",
    "Cliche",
    "ClicheFactoryError",
    "Client",
    "ConfigurationError",
    "Endpoint",
    "ExtractionError",
    "ParsingError",
    "ParsingOptions",
    "PartialExtraction",
    "PostprocessFn",
    "ServiceUnavailableError",
    "TrainingError",
    "UnsupportedModeError",
    "UnsupportedParserError",
    "UploadError",
    "ValidationError",
    "factory",
]
|
|
42
|
+
|
clichefactory/_config.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI configuration file management.
|
|
3
|
+
|
|
4
|
+
Reads/writes ~/.clichefactory/config.toml. Config precedence (highest first):
|
|
5
|
+
1. CLI flags
|
|
6
|
+
2. Environment variables
|
|
7
|
+
3. Config file (~/.clichefactory/config.toml)
|
|
8
|
+
4. Defaults
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
import tomllib
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Per-user configuration directory and the TOML file stored inside it.
_CONFIG_DIR = Path.home().joinpath(".clichefactory")
_CONFIG_FILE = _CONFIG_DIR.joinpath("config.toml")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class ServiceConfig:
    """Values kept in the ``[service]`` table of the config file."""

    # ClicheFactory service API key; empty string when not configured.
    api_key: str = ""
    # Service base URL override; empty string when not configured.
    base_url: str = ""
|
|
26
|
+
|
|
27
|
+
@dataclass
class LocalConfig:
    """Values kept in the ``[local]`` table of the config file."""

    # LLM model name; empty string when not configured.
    model: str = ""
    # API key for the LLM model; empty string when not configured.
    api_key: str = ""
    # Optional separate model name for OCR tasks; empty string when unset.
    ocr_model: str = ""
    # Optional API key for the OCR model; empty string when unset.
    ocr_api_key: str = ""
|
|
33
|
+
|
|
34
|
+
@dataclass
class CLIConfig:
    """In-memory form of the CLI config file: a default mode plus two tables."""

    # Mode used when none is chosen explicitly; defaults to "service".
    default_mode: str = "service"
    # Contents of the ``[service]`` table (fresh instance per CLIConfig).
    service: ServiceConfig = field(default_factory=ServiceConfig)
    # Contents of the ``[local]`` table (fresh instance per CLIConfig).
    local: LocalConfig = field(default_factory=LocalConfig)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def config_dir() -> Path:
    """Return the directory in which the CLI config file is stored."""
    return _CONFIG_DIR
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def config_file_path() -> Path:
    """Return the full path of the CLI config file (``config.toml``)."""
    return _CONFIG_FILE
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def load_config() -> CLIConfig:
    """Read ~/.clichefactory/config.toml into a CLIConfig.

    When the file does not exist a CLIConfig holding only defaults is
    returned. For a table that is present, every known key missing from it
    is set to the empty string.
    """
    loaded = CLIConfig()
    if not _CONFIG_FILE.is_file():
        return loaded

    with _CONFIG_FILE.open("rb") as handle:
        document = tomllib.load(handle)

    loaded.default_mode = document.get("default_mode", loaded.default_mode)

    # (table name, target object, keys copied from that table)
    table_layout = (
        ("service", loaded.service, ("api_key", "base_url")),
        ("local", loaded.local, ("model", "api_key", "ocr_model", "ocr_api_key")),
    )
    for table_name, target, key_names in table_layout:
        if table_name not in document:
            continue
        table = document[table_name]
        for key_name in key_names:
            setattr(target, key_name, table.get(key_name, ""))

    return loaded
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def save_config(cfg: CLIConfig) -> Path:
|
|
76
|
+
"""Write config to ~/.clichefactory/config.toml. Returns the path written."""
|
|
77
|
+
_CONFIG_DIR.mkdir(parents=True, exist_ok=True)
|
|
78
|
+
|
|
79
|
+
lines: list[str] = []
|
|
80
|
+
lines.append(f'default_mode = "{cfg.default_mode}"')
|
|
81
|
+
lines.append("")
|
|
82
|
+
|
|
83
|
+
lines.append("[service]")
|
|
84
|
+
lines.append(f'api_key = "{cfg.service.api_key}"')
|
|
85
|
+
if cfg.service.base_url:
|
|
86
|
+
lines.append(f'base_url = "{cfg.service.base_url}"')
|
|
87
|
+
lines.append("")
|
|
88
|
+
|
|
89
|
+
lines.append("[local]")
|
|
90
|
+
lines.append(f'model = "{cfg.local.model}"')
|
|
91
|
+
lines.append(f'api_key = "{cfg.local.api_key}"')
|
|
92
|
+
if cfg.local.ocr_model:
|
|
93
|
+
lines.append(f"")
|
|
94
|
+
lines.append(f"# Optional: separate model for OCR/VLM tasks (image-to-text).")
|
|
95
|
+
lines.append(f"# If not set, the main model is used for everything.")
|
|
96
|
+
lines.append(f"# Only needed when you want a cheaper/faster model for OCR")
|
|
97
|
+
lines.append(f"# while keeping a more capable model for extraction.")
|
|
98
|
+
lines.append(f'ocr_model = "{cfg.local.ocr_model}"')
|
|
99
|
+
if cfg.local.ocr_api_key and cfg.local.ocr_api_key != cfg.local.api_key:
|
|
100
|
+
lines.append(f'ocr_api_key = "{cfg.local.ocr_api_key}"')
|
|
101
|
+
lines.append("")
|
|
102
|
+
|
|
103
|
+
_CONFIG_FILE.write_text("\n".join(lines), encoding="utf-8")
|
|
104
|
+
return _CONFIG_FILE
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def resolve_api_key(*, cli_flag: str | None, cfg: CLIConfig) -> str:
    """Pick the ClicheFactory service API key.

    The first non-empty value wins, in this order: the CLI flag, the
    ``CLICHEFACTORY_API_KEY`` environment variable, then the config file
    value (returned as-is, even when empty).
    """
    return (
        cli_flag
        or os.environ.get("CLICHEFACTORY_API_KEY", "")
        or cfg.service.api_key
    )
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def resolve_base_url(*, cli_flag: str | None, cfg: CLIConfig) -> str | None:
    """Pick the service base URL, or return None when nothing is configured.

    The first non-empty value wins, in this order: the CLI flag, the
    ``CLICHEFACTORY_API_URL`` environment variable, then the config file.
    """
    return (
        cli_flag
        or os.environ.get("CLICHEFACTORY_API_URL", "")
        or cfg.service.base_url
        or None
    )
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def resolve_model(*, cli_flag: str | None, cfg: CLIConfig) -> str:
    """Pick the LLM model name.

    The first non-empty value wins, in this order: the CLI flag, the
    ``CLICHEFACTORY_LLM_MODEL_NAME`` environment variable, the
    ``LLM_MODEL_NAME`` environment variable, then the config file value
    (returned as-is, even when empty).
    """
    return (
        cli_flag
        or os.environ.get("CLICHEFACTORY_LLM_MODEL_NAME")
        or os.environ.get("LLM_MODEL_NAME", "")
        or cfg.local.model
    )
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def resolve_model_api_key(*, cli_flag: str | None, cfg: CLIConfig) -> str:
    """Pick the LLM API key.

    The first non-empty value wins, in this order: the CLI flag, the
    ``CLICHEFACTORY_LLM_API_KEY`` environment variable, the ``LLM_API_KEY``
    environment variable, then the config file value (returned as-is, even
    when empty).
    """
    return (
        cli_flag
        or os.environ.get("CLICHEFACTORY_LLM_API_KEY")
        or os.environ.get("LLM_API_KEY", "")
        or cfg.local.api_key
    )
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def resolve_ocr_model(*, cli_flag: str | None, cfg: CLIConfig) -> str:
    """Pick the OCR model name.

    The first non-empty value wins, in this order: the CLI flag, the
    ``CLICHEFACTORY_OCR_MODEL_NAME`` environment variable, the
    ``OCR_MODEL_NAME`` environment variable, then the config file value.

    The config file value is returned as-is, so the result may be an empty
    string; this function itself does not substitute the main model.
    """
    return (
        cli_flag
        or os.environ.get("CLICHEFACTORY_OCR_MODEL_NAME")
        or os.environ.get("OCR_MODEL_NAME", "")
        or cfg.local.ocr_model
    )
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def resolve_ocr_api_key(*, cli_flag: str | None, cfg: CLIConfig, model_api_key: str) -> str:
    """Pick the OCR API key, falling back to the main model key.

    The first non-empty value wins, in this order: the CLI flag, the
    ``CLICHEFACTORY_OCR_API_KEY`` environment variable, the ``OCR_API_KEY``
    environment variable, the config file value, then *model_api_key*
    (returned as-is, even when empty).
    """
    return (
        cli_flag
        or os.environ.get("CLICHEFACTORY_OCR_API_KEY")
        or os.environ.get("OCR_API_KEY", "")
        or cfg.local.ocr_api_key
        or model_api_key
    )
|
|
File without changes
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Optional, Sequence, List, Tuple
|
|
4
|
+
|
|
5
|
+
from clichefactory._engine.models.normalized_doc import NormalizedDoc
|
|
6
|
+
from clichefactory._engine.models.document_model import Page, Section, Heading, Table, Block
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class CsvNormalizedDoc(NormalizedDoc):
    """NormalizedDoc for CSV input: all blocks live on one synthetic page."""

    def __init__(self, filename: str, blocks: List[Block], markdown: str) -> None:
        """Store the pre-rendered markdown and derive pages, sections and tables."""
        self.filename = filename
        self.summary_text = ""
        self.markdown = markdown

        # CSV is inherently tabular and has no notion of pages. Use a single synthetic page.
        self.pages = (Page(index=1, size=None, blocks=tuple(blocks)),)
        self.sections = self._build_sections_from_headings(self.pages[0].blocks)

        self.images = tuple()
        self.tables = tuple(b for b in self.pages[0].blocks if isinstance(b, Table))

    def get_plain_text(self) -> str:
        """Return the same text as get_markdown()."""
        return self.get_markdown()

    def get_markdown(self) -> str:
        """Return the markdown passed to the constructor."""
        return self.markdown

    def get_json(self, table_index: int = 0, header: bool = True) -> Optional[list[dict[str, str]]]:
        """
        Convert the table at ``table_index`` to a list of row dicts.

        - If header=True: uses row 0 as header keys.
          - Missing/empty header cell => "col_{index}"
          - Duplicate header names are disambiguated with _1, _2, ... in left-to-right order.
            A suffix is skipped when the resulting name is already used by another
            column, so every column keeps its own key.
        - If header=False: uses synthetic keys "col_{index}" and includes row 0 as data.

        Rows whose cells are all empty are skipped. Returns None when the document
        has no tables or the selected table has no cells. Raises IndexError when
        ``table_index`` does not refer to an existing table.
        """
        if not self.tables:
            return None

        table = self.tables[table_index]
        if not getattr(table, "cells", None):
            return None

        max_row = max(c.row for c in table.cells)
        max_col = max(c.col for c in table.cells)

        # Dense grid of stripped cell text; positions without a cell stay "".
        grid: list[list[str]] = [[""] * (max_col + 1) for _ in range(max_row + 1)]
        for c in table.cells:
            if 0 <= c.row <= max_row and 0 <= c.col <= max_col:
                grid[c.row][c.col] = (c.text or "").strip()

        def make_unique_header(raw_keys: list[str]) -> list[str]:
            base_keys = [(k or "").strip() or f"col_{j}" for j, k in enumerate(raw_keys)]
            counts: dict[str, int] = {}
            for base in base_keys:
                counts[base] = counts.get(base, 0) + 1

            # Names that appear once are kept verbatim, so they are reserved up front.
            taken = {base for base, count in counts.items() if count == 1}
            last_suffix: dict[str, int] = {}
            unique: list[str] = []
            for base in base_keys:
                if counts[base] == 1:
                    unique.append(base)
                    continue
                suffix = last_suffix.get(base, 0) + 1
                # Skip suffixes that would collide with an existing column name.
                while f"{base}_{suffix}" in taken:
                    suffix += 1
                last_suffix[base] = suffix
                name = f"{base}_{suffix}"
                taken.add(name)
                unique.append(name)
            return unique

        if header and grid:
            keys = make_unique_header(grid[0])
            start_row = 1
        else:
            keys = [f"col_{j}" for j in range(max_col + 1)]
            start_row = 0

        out: list[dict[str, str]] = []
        for row_vals in grid[start_row:max_row + 1]:
            if not any(v.strip() for v in row_vals):
                continue
            out.append(dict(zip(keys, row_vals)))

        return out

    def _build_sections_from_headings(self, blocks: Sequence[Block]) -> Sequence[Section]:
        """Group *blocks* into nested Sections driven by Heading levels.

        A Heading opens a new section; a heading of level N first closes every
        open section of level >= N. Blocks seen before any heading are placed
        in an implicit level-1 section titled "Document".
        """

        class _Builder:
            def __init__(self, heading: Heading) -> None:
                self.heading = heading
                self.blocks: list[Block] = []
                self.children: list[Section] = []

            def finalize(self) -> Section:
                return Section(
                    heading=self.heading,
                    blocks=tuple(self.blocks),
                    subsections=tuple(self.children),
                )

        stack: list[Tuple[int, _Builder]] = []
        roots: list[Section] = []

        def close_top() -> None:
            # Finalize the innermost open section and attach it to its parent (or the roots).
            _, builder = stack.pop()
            section = builder.finalize()
            if stack:
                stack[-1][1].children.append(section)
            else:
                roots.append(section)

        def push_section(h: Heading) -> None:
            lvl = max(1, int(h.level))
            while stack and stack[-1][0] >= lvl:
                close_top()
            stack.append((lvl, _Builder(h)))

        for blk in blocks:
            if isinstance(blk, Heading):
                push_section(blk)
                continue
            if not stack:
                push_section(Heading(level=1, text="Document"))
            stack[-1][1].blocks.append(blk)

        while stack:
            close_top()

        return tuple(roots)
|
|
139
|
+
|
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
from collections import defaultdict
|
|
2
|
+
from typing import DefaultDict, Dict, Literal, Optional, Sequence, Tuple
|
|
3
|
+
|
|
4
|
+
from clichefactory._engine.models.normalized_doc import NormalizedDoc
|
|
5
|
+
from clichefactory._engine.models.document_model import BBox, Block, Heading, Image, Page
|
|
6
|
+
from clichefactory._engine.models.document_model import Paragraph, Section, Table, TableCell
|
|
7
|
+
from docling_core.types.doc.base import BoundingBox
|
|
8
|
+
from docling_core.types.doc.document import (
|
|
9
|
+
DocItem,
|
|
10
|
+
DoclingDocument,
|
|
11
|
+
PictureItem,
|
|
12
|
+
SectionHeaderItem,
|
|
13
|
+
TableItem,
|
|
14
|
+
NodeItem,
|
|
15
|
+
TextItem,
|
|
16
|
+
TitleItem,
|
|
17
|
+
ListItem,
|
|
18
|
+
CodeItem,
|
|
19
|
+
FormulaItem,
|
|
20
|
+
KeyValueItem
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DoclingNormalizedDoc(NormalizedDoc):
    """
    NormalizedDoc from Docling document.

    output_mode: "markdown" uses docling_document.export_to_markdown();
    "structured" builds markdown from pages/sections/images/tables.
    """

    def __init__(
        self,
        docling_document: DoclingDocument,
        *,
        output_mode: Literal["markdown", "structured"] = "markdown",
    ) -> None:
        """Map *docling_document* onto pages, sections, images and tables."""
        self.docling_document = docling_document
        self._output_mode = output_mode
        self.pages: Sequence[Page] = self.build_pages()
        self.sections: Sequence[Section] = self.build_sections()

        self.images = tuple(
            block
            for page in self.pages
            for block in page.blocks
            if isinstance(block, Image)
        )

        self.tables = tuple(
            block
            for page in self.pages
            for block in page.blocks
            if isinstance(block, Table)
        )

    # ----- Implementation of the NormalizedDoc interface -----

    def get_plain_text(self) -> str:
        """Return the Docling document exported as plain text."""
        return self.docling_document.export_to_text()

    def get_markdown(self) -> str:
        """Return markdown: Docling's own export, or one built from the mapped blocks.

        Docling's export is used only when output_mode is "markdown"; any other
        value goes through blocks_to_markdown.
        """
        if self._output_mode == "markdown":
            return self.docling_document.export_to_markdown()
        from clichefactory._engine.parsers.parser_utils.pdf.docling_helpers import blocks_to_markdown

        return blocks_to_markdown(self.pages, self.sections, self.images, self.tables)

    # ----- Docling to document_model mapping functions -----

    def build_pages(self) -> Sequence[Page]:
        """Build Page objects, one per Docling page or a single synthetic page."""
        blocks_by_page = self._collect_blocks_by_page()

        # If docling provides pages, respect them
        if self.docling_document.pages:
            return tuple(
                Page(
                    index=index,
                    size=(page_item.size.width, page_item.size.height) if page_item.size else None,
                    blocks=tuple(blocks_by_page.get(index, [])),
                )
                for index, page_item in self.docling_document.pages.items()
            )

        # Otherwise: single synthetic page (DOCX pagination may be unavailable)
        all_blocks: list[Block] = []
        for page_idx in sorted(blocks_by_page):
            all_blocks.extend(blocks_by_page[page_idx])
        if not all_blocks:
            # last resort: collect in reading order with no page assignment
            for item, _ in self.docling_document.iterate_items(with_groups=False):
                b = self._to_block(item)
                if b is not None:
                    all_blocks.append(b)

        return (Page(index=1, size=None, blocks=tuple(all_blocks)),)

    def build_sections(self) -> Sequence[Section]:
        """
        Build a semantic hierarchy from headings.

        Strategy:
        - Flatten blocks in reading order (page order, then block order).
        - Start a new section when we hit a Heading.
        - Nest based on heading level (H1 contains H2, etc.)
        """
        flat_blocks: list[Block] = []
        # reading order: by page index then appearance
        for page in sorted(self.pages, key=lambda p: p.index):
            flat_blocks.extend(page.blocks)

        # Collect top-level sections using a stack of (level, SectionBuilder)
        class _Builder:
            def __init__(self, heading: Heading) -> None:
                self.heading = heading
                self.blocks: list[Block] = []
                self.children: list[Section] = []

            def finalize(self) -> Section:
                return Section(heading=self.heading, blocks=tuple(self.blocks), subsections=tuple(self.children))

        stack: list[Tuple[int, _Builder]] = []
        roots: list[Section] = []

        def close_top() -> None:
            # Finalize the innermost open section and attach it to its parent (or the roots).
            _, builder = stack.pop()
            section = builder.finalize()
            if stack:
                stack[-1][1].children.append(section)
            else:
                roots.append(section)

        def push_section(h: Heading) -> None:
            lvl = max(1, int(h.level))
            # pop until parent is strictly lower level
            while stack and stack[-1][0] >= lvl:
                close_top()
            stack.append((lvl, _Builder(h)))

        for blk in flat_blocks:
            if isinstance(blk, Heading):
                push_section(blk)
                continue
            if not stack:
                # No heading seen yet: create an implicit H1 section so content isn't lost
                push_section(Heading(level=1, text="Document"))
            stack[-1][1].blocks.append(blk)

        # close remaining
        while stack:
            close_top()

        return tuple(roots)

    # Collect all blocks for all pages (poor man's provenance)
    def _collect_blocks_by_page(self) -> Dict[int, list[Block]]:
        """Group mapped blocks by the page number of each item's first provenance entry.

        Items that map to no block, or that carry no provenance, are skipped.
        """
        blocks: DefaultDict[int, list[Block]] = defaultdict(list)
        for item, _ in self.docling_document.iterate_items(with_groups=False):
            block = self._to_block(item)

            if block is None:
                continue

            if not isinstance(item, DocItem):  # Could have been a GroupItem or NodeItem or similar semantic, non content descriptor
                continue

            page_no = self._get_primary_page(item)
            if page_no is None:
                continue

            blocks[page_no].append(block)

        return blocks

    def _to_block(self, item: NodeItem) -> Optional[Block]:
        """Translate one Docling item into a document_model block, or None if unsupported."""
        if not isinstance(item, DocItem):
            return None

        bbox = self._item_bbox(item)

        if isinstance(item, TableItem):
            return self._table_to_block(item, bbox)
        if isinstance(item, PictureItem):
            return self._image_to_block(item, bbox)
        # Heading-like items are tested before the generic text case below.
        if isinstance(item, SectionHeaderItem):
            return Heading(level=item.level, text=item.text, bbox=bbox)
        if isinstance(item, TitleItem):
            return Heading(level=1, text=item.text, bbox=bbox)
        # Plain text, list entries, code and formulas all become paragraphs.
        if isinstance(item, (TextItem, ListItem, CodeItem, FormulaItem)):
            return Paragraph(text=item.text, bbox=bbox)
        if isinstance(item, KeyValueItem):
            return Paragraph(text=self._key_value_to_md(item), bbox=bbox)
        # FormItem is made of KeyValueItems, TextItems, etc., skipping for now

        return None

    def _table_to_block(self, table_item: TableItem, bbox: Optional[BBox]) -> Optional[Table]:
        """Convert a Docling table into a Table whose cells carry row/col start offsets."""
        page_height = self._get_page_height(table_item)
        cells = tuple(
            TableCell(
                text=cell.text,
                row=cell.start_row_offset_idx,
                col=cell.start_col_offset_idx,
                bbox=self._normalize_bbox(cell.bbox, page_height),
            )
            for cell in table_item.data.table_cells
        )
        return Table(cells=cells, bbox=bbox)

    def _image_to_block(self, picture_item: PictureItem, bbox: Optional[BBox]) -> Optional[Image]:
        """Convert a Docling picture into an Image block.

        Without an embedded image reference, the item's self_ref is used as the
        reference and the MIME type falls back to "application/octet-stream".
        """
        img_ref = picture_item.image
        if img_ref is None:
            ref = picture_item.self_ref
            mime = "application/octet-stream"
        else:
            ref = str(img_ref.uri)
            mime = img_ref.mimetype

        alt_text = picture_item.caption_text(self.docling_document) or None
        return Image(ref=ref, mime_type=mime, bbox=bbox, alt_text=alt_text)

    def _get_primary_page(self, item: DocItem) -> Optional[int]:
        """Return the page number of the item's first provenance entry, if any."""
        if not item.prov:
            return None
        return item.prov[0].page_no

    def _item_bbox(self, item: DocItem) -> Optional[BBox]:
        """Return the normalized bbox of the item's first provenance entry, if any."""
        if not item.prov:
            return None
        return self._normalize_bbox(item.prov[0].bbox, self._get_page_height(item))

    def _normalize_bbox(self, bbox: Optional[BoundingBox], page_height: Optional[float]) -> Optional[BBox]:
        """Return *bbox* as a tuple, converted to a top-left origin when the page height is known."""
        if bbox is None:
            return None
        if page_height:
            return bbox.to_top_left_origin(page_height).as_tuple()
        return bbox.as_tuple()

    def _get_page_height(self, item: DocItem) -> Optional[float]:
        """Return the height of the item's primary page, or None when it is unknown."""
        page_no = self._get_primary_page(item)
        if page_no is None:
            return None
        page = self.docling_document.pages.get(page_no)
        # Same guard as build_pages: a page may come without size information.
        if page is None or not page.size:
            return None
        return page.size.height

    def _key_value_to_md(self, item: KeyValueItem) -> str:
        """Render *item* as "key: value" using the first entry of its model_dump()."""
        # NOTE(review): this takes the first field of the dumped model, which is
        # presumably a structural field rather than the key/value content - confirm
        # against the docling-core KeyValueItem schema before relying on the output.
        dumped_kv = item.model_dump() if hasattr(item, "model_dump") else {}
        key = next(iter(dumped_kv), "") or ""
        value = dumped_kv.get(key, "")

        key = (key or "").strip()
        value = (str(value) if value is not None else "").strip()

        return f"{key}: {value}"
|
|
280
|
+
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from typing import Sequence
|
|
2
|
+
|
|
3
|
+
from clichefactory._engine.models.document_model import Image, Page, Section, Table
|
|
4
|
+
from clichefactory._engine.models.normalized_doc import NormalizedDoc
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class EmlDoc(NormalizedDoc):
    """NormalizedDoc for an email message, built from already-parsed parts.

    The constructor stores its arguments unchanged; plain text and markdown
    are supplied by the caller rather than derived from the pages.
    """

    summary_text: str
    media_type: str
    pages: Sequence[Page]
    sections: Sequence[Section]
    # Declared under the names __init__ actually assigns.
    images: Sequence[Image]
    tables: Sequence[Table]

    # Internal fields
    _plain_text: str
    _markdown: str

    def __init__(
        self,
        summary_text: str,
        media_type: str,
        pages: Sequence[Page],
        sections: Sequence[Section],
        images: Sequence[Image],
        tables: Sequence[Table],
        _plain_text: str,
        _markdown: str,
    ) -> None:
        """Store every argument on the instance under the same name."""
        self.summary_text = summary_text
        self.media_type = media_type
        self.pages = pages
        self.sections = sections
        self.images = images
        self.tables = tables
        self._plain_text = _plain_text
        self._markdown = _markdown

    def get_plain_text(self) -> str:
        """Return the plain text given at construction."""
        return self._plain_text

    def get_markdown(self) -> str:
        """Return the markdown given at construction."""
        return self._markdown
|