clichefactory 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. clichefactory/__about__.py +1 -0
  2. clichefactory/__init__.py +42 -0
  3. clichefactory/_config.py +164 -0
  4. clichefactory/_engine/__init__.py +6 -0
  5. clichefactory/_engine/adapters/__init__.py +0 -0
  6. clichefactory/_engine/adapters/csv_adapter.py +139 -0
  7. clichefactory/_engine/adapters/docling_adapter.py +280 -0
  8. clichefactory/_engine/adapters/eml_adapter.py +45 -0
  9. clichefactory/_engine/adapters/vlm_adapter.py +72 -0
  10. clichefactory/_engine/adapters/xlsx_adapter.py +142 -0
  11. clichefactory/_engine/ai_clients/__init__.py +20 -0
  12. clichefactory/_engine/ai_clients/anthropic_client.py +308 -0
  13. clichefactory/_engine/ai_clients/factory.py +104 -0
  14. clichefactory/_engine/ai_clients/gemini_client.py +424 -0
  15. clichefactory/_engine/ai_clients/json_utils.py +150 -0
  16. clichefactory/_engine/ai_clients/ollama_client.py +170 -0
  17. clichefactory/_engine/ai_clients/openai_client.py +257 -0
  18. clichefactory/_engine/ai_clients/prompts.py +21 -0
  19. clichefactory/_engine/ai_clients/protocol.py +71 -0
  20. clichefactory/_engine/ai_clients/tests/__init__.py +0 -0
  21. clichefactory/_engine/ai_clients/tests/test_gemini_usage_phases.py +32 -0
  22. clichefactory/_engine/ai_clients/usage_tracker.py +23 -0
  23. clichefactory/_engine/cache/__init__.py +10 -0
  24. clichefactory/_engine/cache/base_cacher.py +29 -0
  25. clichefactory/_engine/cache/file_system_cacher.py +49 -0
  26. clichefactory/_engine/config/__init__.py +5 -0
  27. clichefactory/_engine/config/base_config.py +58 -0
  28. clichefactory/_engine/contracts/document_metadata.py +24 -0
  29. clichefactory/_engine/contracts/fingerprinting.py +22 -0
  30. clichefactory/_engine/contracts/key_builder.py +47 -0
  31. clichefactory/_engine/contracts/model_schema.py +248 -0
  32. clichefactory/_engine/contracts/operations.py +28 -0
  33. clichefactory/_engine/contracts/payloads/deployment.py +24 -0
  34. clichefactory/_engine/contracts/payloads/inference.py +53 -0
  35. clichefactory/_engine/contracts/payloads/training.py +138 -0
  36. clichefactory/_engine/contracts/validators.py +45 -0
  37. clichefactory/_engine/extractors/__init__.py +12 -0
  38. clichefactory/_engine/metrics/__init__.py +5 -0
  39. clichefactory/_engine/metrics/metrics_config.py +39 -0
  40. clichefactory/_engine/metrics/metrics_config_example.yaml +22 -0
  41. clichefactory/_engine/models/document_model.py +52 -0
  42. clichefactory/_engine/models/normalized_doc.py +60 -0
  43. clichefactory/_engine/models/usage_summary.py +11 -0
  44. clichefactory/_engine/parsers/csv_parser.py +204 -0
  45. clichefactory/_engine/parsers/doc_parser.py +28 -0
  46. clichefactory/_engine/parsers/docling_pipeline_options.py +7 -0
  47. clichefactory/_engine/parsers/docx_parser.py +37 -0
  48. clichefactory/_engine/parsers/eml_parser.py +183 -0
  49. clichefactory/_engine/parsers/fallback_media_parser.py +100 -0
  50. clichefactory/_engine/parsers/image_parser.py +76 -0
  51. clichefactory/_engine/parsers/media_parser.py +88 -0
  52. clichefactory/_engine/parsers/media_parser_registry.py +79 -0
  53. clichefactory/_engine/parsers/parser_utils/eml_utils.py +181 -0
  54. clichefactory/_engine/parsers/parser_utils/image/__init__.py +6 -0
  55. clichefactory/_engine/parsers/parser_utils/image/image_pipeline.py +158 -0
  56. clichefactory/_engine/parsers/parser_utils/image/image_pipeline_options.py +43 -0
  57. clichefactory/_engine/parsers/parser_utils/image/parsers/__init__.py +13 -0
  58. clichefactory/_engine/parsers/parser_utils/image/parsers/docling.py +47 -0
  59. clichefactory/_engine/parsers/parser_utils/image/parsers/ocr_llm.py +74 -0
  60. clichefactory/_engine/parsers/parser_utils/image/parsers/pytesseract.py +47 -0
  61. clichefactory/_engine/parsers/parser_utils/image/parsers/rapidocr.py +47 -0
  62. clichefactory/_engine/parsers/parser_utils/lang_mapping.py +127 -0
  63. clichefactory/_engine/parsers/parser_utils/layout/__init__.py +4 -0
  64. clichefactory/_engine/parsers/parser_utils/media_router.py +30 -0
  65. clichefactory/_engine/parsers/parser_utils/media_type_detector.py +197 -0
  66. clichefactory/_engine/parsers/parser_utils/office_converter.py +125 -0
  67. clichefactory/_engine/parsers/parser_utils/pdf/__init__.py +5 -0
  68. clichefactory/_engine/parsers/parser_utils/pdf/classifier.py +58 -0
  69. clichefactory/_engine/parsers/parser_utils/pdf/docling_helpers.py +448 -0
  70. clichefactory/_engine/parsers/parser_utils/pdf/docling_pipeline_options.py +105 -0
  71. clichefactory/_engine/parsers/parser_utils/pdf/strategies/__init__.py +13 -0
  72. clichefactory/_engine/parsers/parser_utils/pdf/strategies/docling_baseline.py +79 -0
  73. clichefactory/_engine/parsers/parser_utils/pdf/strategies/docling_vlm.py +160 -0
  74. clichefactory/_engine/parsers/parser_utils/pdf/strategies/ocr_llm.py +97 -0
  75. clichefactory/_engine/parsers/parser_utils/pdf/strategies/pymupdf_structured.py +118 -0
  76. clichefactory/_engine/parsers/parser_utils/pdf_repair.py +102 -0
  77. clichefactory/_engine/parsers/parser_utils/prompts.py +97 -0
  78. clichefactory/_engine/parsers/pdf_parser.py +130 -0
  79. clichefactory/_engine/parsers/text_parser.py +51 -0
  80. clichefactory/_engine/parsers/xlsx_parser.py +310 -0
  81. clichefactory/_extract_finalize.py +41 -0
  82. clichefactory/_extract_validation.py +49 -0
  83. clichefactory/_local.py +488 -0
  84. clichefactory/_schema.py +248 -0
  85. clichefactory/_service.py +312 -0
  86. clichefactory/_service_url.py +23 -0
  87. clichefactory/_upload.py +244 -0
  88. clichefactory/_utils.py +146 -0
  89. clichefactory/cli.py +652 -0
  90. clichefactory/cliche.py +305 -0
  91. clichefactory/client.py +364 -0
  92. clichefactory/errors.py +61 -0
  93. clichefactory/types.py +131 -0
  94. clichefactory-0.1.0.dist-info/METADATA +473 -0
  95. clichefactory-0.1.0.dist-info/RECORD +98 -0
  96. clichefactory-0.1.0.dist-info/WHEEL +4 -0
  97. clichefactory-0.1.0.dist-info/entry_points.txt +2 -0
  98. clichefactory-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,42 @@
1
+ from __future__ import annotations
2
+
3
+ from clichefactory.__about__ import __version__
4
+ from clichefactory.client import Client, factory
5
+ from clichefactory.cliche import Cliche
6
+ from clichefactory.types import Endpoint, PartialExtraction, ParsingOptions, PostprocessFn
7
+ from clichefactory.errors import (
8
+ AuthenticationError,
9
+ ClicheFactoryError,
10
+ ConfigurationError,
11
+ ExtractionError,
12
+ ParsingError,
13
+ ServiceUnavailableError,
14
+ TrainingError,
15
+ UnsupportedModeError,
16
+ UnsupportedParserError,
17
+ UploadError,
18
+ ValidationError,
19
+ )
20
+
21
+ __all__ = [
22
+ "__version__",
23
+ "AuthenticationError",
24
+ "Cliche",
25
+ "ClicheFactoryError",
26
+ "Client",
27
+ "ConfigurationError",
28
+ "Endpoint",
29
+ "ExtractionError",
30
+ "ParsingError",
31
+ "ParsingOptions",
32
+ "PartialExtraction",
33
+ "PostprocessFn",
34
+ "ServiceUnavailableError",
35
+ "TrainingError",
36
+ "UnsupportedModeError",
37
+ "UnsupportedParserError",
38
+ "UploadError",
39
+ "ValidationError",
40
+ "factory",
41
+ ]
42
+
@@ -0,0 +1,164 @@
1
+ """
2
+ CLI configuration file management.
3
+
4
+ Reads/writes ~/.clichefactory/config.toml. Config precedence (highest first):
5
+ 1. CLI flags
6
+ 2. Environment variables
7
+ 3. Config file (~/.clichefactory/config.toml)
8
+ 4. Defaults
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import os
13
+ import tomllib
14
+ from dataclasses import dataclass, field
15
+ from pathlib import Path
16
+
17
+
18
# Per-user configuration directory, and the TOML file kept inside it.
_CONFIG_DIR = Path.home().joinpath(".clichefactory")
_CONFIG_FILE = _CONFIG_DIR.joinpath("config.toml")
20
+
21
+
22
@dataclass
class ServiceConfig:
    """Settings stored under the ``[service]`` table of the config file."""

    # Service API key; empty string when not configured.
    api_key: str = ""
    # Service base URL override; empty string when not configured.
    base_url: str = ""
26
+
27
@dataclass
class LocalConfig:
    """Settings stored under the ``[local]`` table of the config file."""

    # LLM model name and its API key; empty strings when not configured.
    model: str = ""
    api_key: str = ""
    # Optional OCR model name and its API key; empty strings when not configured.
    ocr_model: str = ""
    ocr_api_key: str = ""
33
+
34
@dataclass
class CLIConfig:
    """In-memory form of the whole CLI configuration file."""

    # Top-level ``default_mode`` key; defaults to "service".
    default_mode: str = "service"
    # Each instance gets its own nested config objects via default_factory.
    service: ServiceConfig = field(default_factory=ServiceConfig)
    local: LocalConfig = field(default_factory=LocalConfig)
39
+
40
+
41
def config_dir() -> Path:
    """Return the directory that holds the CLI configuration file."""
    return _CONFIG_DIR
43
+
44
+
45
def config_file_path() -> Path:
    """Return the full path of the CLI configuration file."""
    return _CONFIG_FILE
47
+
48
+
49
def load_config() -> CLIConfig:
    """Read the CLI config file into a ``CLIConfig``.

    Returns a default-initialised ``CLIConfig`` when the config file does not
    exist. When a ``[service]`` or ``[local]`` table is present, every field of
    that table is assigned, using an empty string for keys the table omits.
    """
    loaded = CLIConfig()
    if not _CONFIG_FILE.is_file():
        return loaded

    with _CONFIG_FILE.open("rb") as handle:
        data = tomllib.load(handle)

    loaded.default_mode = data.get("default_mode", loaded.default_mode)

    # (table name, target object, fields copied from that table)
    table_layout = (
        ("service", loaded.service, ("api_key", "base_url")),
        ("local", loaded.local, ("model", "api_key", "ocr_model", "ocr_api_key")),
    )
    for table_name, target, field_names in table_layout:
        if table_name not in data:
            continue
        table = data[table_name]
        for field_name in field_names:
            setattr(target, field_name, table.get(field_name, ""))

    return loaded
73
+
74
+
75
def _toml_basic_string(value: object) -> str:
    """Render *value* as a double-quoted TOML basic string.

    Backslash, double quote and control characters are escaped so that the
    written file stays parseable whatever the value contains. Values without
    such characters are emitted exactly as ``"<value>"``.
    """
    short_escapes = {
        "\\": "\\\\",
        '"': '\\"',
        "\b": "\\b",
        "\t": "\\t",
        "\n": "\\n",
        "\f": "\\f",
        "\r": "\\r",
    }
    pieces: list[str] = []
    for ch in str(value):
        if ch in short_escapes:
            pieces.append(short_escapes[ch])
        elif ord(ch) < 0x20 or ord(ch) == 0x7F:
            # Remaining control characters may not appear raw in a basic string.
            pieces.append(f"\\u{ord(ch):04X}")
        else:
            pieces.append(ch)
    return '"' + "".join(pieces) + '"'


def save_config(cfg: CLIConfig) -> Path:
    """Write *cfg* to the CLI config file and return the path written.

    The config directory is created when missing. ``base_url``, ``ocr_model``
    and ``ocr_api_key`` are only written when set; ``ocr_api_key`` is also
    skipped when it equals the main local API key. All values are escaped as
    TOML basic strings, so quotes or backslashes in a key, URL or model name
    cannot corrupt the file.
    """
    _CONFIG_DIR.mkdir(parents=True, exist_ok=True)
    quote = _toml_basic_string

    lines: list[str] = [
        f"default_mode = {quote(cfg.default_mode)}",
        "",
        "[service]",
        f"api_key = {quote(cfg.service.api_key)}",
    ]
    if cfg.service.base_url:
        lines.append(f"base_url = {quote(cfg.service.base_url)}")
    lines.append("")

    lines.extend(
        [
            "[local]",
            f"model = {quote(cfg.local.model)}",
            f"api_key = {quote(cfg.local.api_key)}",
        ]
    )
    if cfg.local.ocr_model:
        lines.extend(
            [
                "",
                "# Optional: separate model for OCR/VLM tasks (image-to-text).",
                "# If not set, the main model is used for everything.",
                "# Only needed when you want a cheaper/faster model for OCR",
                "# while keeping a more capable model for extraction.",
                f"ocr_model = {quote(cfg.local.ocr_model)}",
            ]
        )
    if cfg.local.ocr_api_key and cfg.local.ocr_api_key != cfg.local.api_key:
        lines.append(f"ocr_api_key = {quote(cfg.local.ocr_api_key)}")
    lines.append("")

    _CONFIG_FILE.write_text("\n".join(lines), encoding="utf-8")
    return _CONFIG_FILE
105
+
106
+
107
def resolve_api_key(*, cli_flag: str | None, cfg: CLIConfig) -> str:
    """Pick the service API key from the first non-empty source.

    Order: ``cli_flag``, then the ``CLICHEFACTORY_API_KEY`` environment
    variable, then ``cfg.service.api_key``.
    """
    return (
        cli_flag
        or os.environ.get("CLICHEFACTORY_API_KEY", "")
        or cfg.service.api_key
    )
115
+
116
+
117
def resolve_base_url(*, cli_flag: str | None, cfg: CLIConfig) -> str | None:
    """Pick the service base URL from the first non-empty source.

    Order: ``cli_flag``, then the ``CLICHEFACTORY_API_URL`` environment
    variable, then ``cfg.service.base_url``. Returns ``None`` when none of
    them is set.
    """
    return (
        cli_flag
        or os.environ.get("CLICHEFACTORY_API_URL", "")
        or cfg.service.base_url
        or None
    )
125
+
126
+
127
def resolve_model(*, cli_flag: str | None, cfg: CLIConfig) -> str:
    """Pick the LLM model name from the first non-empty source.

    Order: ``cli_flag``, then the ``CLICHEFACTORY_LLM_MODEL_NAME`` and
    ``LLM_MODEL_NAME`` environment variables (in that order), then
    ``cfg.local.model``.
    """
    if cli_flag:
        return cli_flag
    for var_name in ("CLICHEFACTORY_LLM_MODEL_NAME", "LLM_MODEL_NAME"):
        from_env = os.environ.get(var_name)
        if from_env:
            return from_env
    return cfg.local.model
135
+
136
+
137
def resolve_model_api_key(*, cli_flag: str | None, cfg: CLIConfig) -> str:
    """Pick the LLM API key from the first non-empty source.

    Order: ``cli_flag``, then the ``CLICHEFACTORY_LLM_API_KEY`` and
    ``LLM_API_KEY`` environment variables (in that order), then
    ``cfg.local.api_key``.
    """
    if cli_flag:
        return cli_flag
    for var_name in ("CLICHEFACTORY_LLM_API_KEY", "LLM_API_KEY"):
        from_env = os.environ.get(var_name)
        if from_env:
            return from_env
    return cfg.local.api_key
145
+
146
+
147
def resolve_ocr_model(*, cli_flag: str | None, cfg: CLIConfig) -> str:
    """Resolve the OCR model name: CLI flag > env > config file.

    The environment variables consulted are ``CLICHEFACTORY_OCR_MODEL_NAME``
    and then ``OCR_MODEL_NAME``. When no source provides a value this returns
    ``cfg.local.ocr_model`` as-is, which may be an empty string. This function
    does not fall back to the main model itself; a caller that wants that
    fallback has to apply it.
    """
    if cli_flag:
        return cli_flag
    env = os.environ.get("CLICHEFACTORY_OCR_MODEL_NAME") or os.environ.get("OCR_MODEL_NAME", "")
    if env:
        return env
    return cfg.local.ocr_model
155
+
156
+
157
def resolve_ocr_api_key(*, cli_flag: str | None, cfg: CLIConfig, model_api_key: str) -> str:
    """Pick the OCR API key from the first non-empty source.

    Order: ``cli_flag``, then the ``CLICHEFACTORY_OCR_API_KEY`` and
    ``OCR_API_KEY`` environment variables (in that order), then
    ``cfg.local.ocr_api_key``, and finally ``model_api_key``.
    """
    if cli_flag:
        return cli_flag
    for var_name in ("CLICHEFACTORY_OCR_API_KEY", "OCR_API_KEY"):
        from_env = os.environ.get(var_name)
        if from_env:
            return from_env
    configured = cfg.local.ocr_api_key
    return configured if configured else model_api_key
@@ -0,0 +1,6 @@
1
+ """
2
+ Private SDK engine internals.
3
+
4
+ This package contains local parsing and extraction implementation details and
5
+ is not part of the public stable API surface.
6
+ """
File without changes
@@ -0,0 +1,139 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Optional, Sequence, List, Tuple
4
+
5
+ from clichefactory._engine.models.normalized_doc import NormalizedDoc
6
+ from clichefactory._engine.models.document_model import Page, Section, Heading, Table, Block
7
+
8
+
9
class CsvNormalizedDoc(NormalizedDoc):
    """NormalizedDoc for CSV input, modelled as one synthetic page of blocks.

    The markdown rendering is supplied by the caller and returned unchanged by
    both ``get_markdown`` and ``get_plain_text``.
    """

    def __init__(self, filename: str, blocks: List[Block], markdown: str) -> None:
        """Store *markdown* and derive pages, sections and tables from *blocks*."""
        self.filename = filename
        self.summary_text = ""
        self.markdown = markdown

        # CSV is inherently tabular and has no notion of pages. Use a single synthetic page.
        self.pages = (Page(index=1, size=None, blocks=tuple(blocks)),)
        self.sections = self._build_sections_from_headings(self.pages[0].blocks)

        self.images = tuple()
        self.tables = tuple(b for b in self.pages[0].blocks if isinstance(b, Table))

    def get_plain_text(self) -> str:
        """Return the same text as ``get_markdown``."""
        return self.get_markdown()

    def get_markdown(self) -> str:
        """Return the markdown string passed to the constructor."""
        return self.markdown

    @staticmethod
    def _unique_header_keys(raw_keys: list[str]) -> list[str]:
        """Turn raw header cells into distinct dict keys.

        Empty cells become ``col_{index}``. A name that occurs more than once
        gets ``_1``, ``_2``, ... appended left to right. A suffix is skipped
        when the resulting key is already taken by another column, so the
        returned keys are always distinct.
        """
        base_keys: list[str] = []
        counts: dict[str, int] = {}
        for j, k in enumerate(raw_keys):
            base = (k or "").strip() or f"col_{j}"
            base_keys.append(base)
            counts[base] = counts.get(base, 0) + 1

        # Names that occur exactly once keep their spelling, so reserve them first.
        taken = {base for base in base_keys if counts[base] == 1}
        last_suffix: dict[str, int] = {}
        unique: list[str] = []
        for base in base_keys:
            if counts[base] == 1:
                unique.append(base)
                continue
            suffix = last_suffix.get(base, 0) + 1
            candidate = f"{base}_{suffix}"
            while candidate in taken:
                suffix += 1
                candidate = f"{base}_{suffix}"
            last_suffix[base] = suffix
            taken.add(candidate)
            unique.append(candidate)
        return unique

    def get_json(self, table_index: int = 0, header: bool = True) -> Optional[list[dict[str, str]]]:
        """
        Convert the table at ``table_index`` (default: the first) to a list of row dicts.

        - If header=True: uses row 0 as header keys.
            - Missing/empty header cell => "col_{index}"
            - Duplicate header names are disambiguated with _1, _2, ... in left-to-right order.
        - If header=False: uses synthetic keys "col_{index}" and includes row 0 as data.

        Rows whose cells are all empty are skipped. Returns None when the
        document has no tables, ``table_index`` does not refer to an existing
        table, or the selected table has no cells.
        """
        if not self.tables:
            return None

        try:
            t = self.tables[table_index]
        except IndexError:
            # Treat "no such table" like the other "nothing to convert" cases.
            return None
        if not getattr(t, "cells", None):
            return None

        max_row = max(c.row for c in t.cells)
        max_col = max(c.col for c in t.cells)

        # Dense grid of stripped cell text; positions without a cell stay "".
        grid: list[list[str]] = [[""] * (max_col + 1) for _ in range(max_row + 1)]
        for c in t.cells:
            if 0 <= c.row <= max_row and 0 <= c.col <= max_col:
                grid[c.row][c.col] = (c.text or "").strip()

        if header and grid:
            keys = self._unique_header_keys(grid[0])
            start_row = 1
        else:
            keys = [f"col_{j}" for j in range(max_col + 1)]
            start_row = 0

        out: list[dict[str, str]] = []
        for r in range(start_row, max_row + 1):
            row_vals = grid[r]
            if all(not (v or "").strip() for v in row_vals):
                continue
            out.append({keys[j]: (row_vals[j] or "") for j in range(len(keys))})

        return out

    def _build_sections_from_headings(self, blocks: Sequence[Block]) -> Sequence[Section]:
        """Group *blocks* into nested sections driven by Heading levels.

        A Heading opens a new section and closes any open section of the same
        or a deeper level. Other blocks are attached to the innermost open
        section. Blocks seen before any heading go into an implicit level-1
        "Document" section.
        """

        class _Builder:
            """Mutable accumulator for one section until it is closed."""

            def __init__(self, heading: Heading) -> None:
                self.heading = heading
                self.blocks: list[Block] = []
                self.children: list[Section] = []

            def finalize(self) -> Section:
                return Section(
                    heading=self.heading,
                    blocks=tuple(self.blocks),
                    subsections=tuple(self.children),
                )

        # Open sections, outermost first, as (level, builder) pairs.
        stack: list[Tuple[int, _Builder]] = []
        roots: list[Section] = []

        def close_innermost() -> None:
            # Finalize the innermost open section and attach it to its parent,
            # or to the root list when it has no parent.
            _, closed = stack.pop()
            sec = closed.finalize()
            if stack:
                stack[-1][1].children.append(sec)
            else:
                roots.append(sec)

        def push_section(h: Heading) -> None:
            lvl = max(1, int(h.level))
            while stack and stack[-1][0] >= lvl:
                close_innermost()
            stack.append((lvl, _Builder(h)))

        def add_to_current(block: Block) -> None:
            if not stack:
                push_section(Heading(level=1, text="Document"))
            stack[-1][1].blocks.append(block)

        for blk in blocks:
            if isinstance(blk, Heading):
                push_section(blk)
            else:
                add_to_current(blk)

        while stack:
            close_innermost()

        return tuple(roots)
139
+
@@ -0,0 +1,280 @@
1
+ from collections import defaultdict
2
+ from typing import DefaultDict, Dict, Literal, Optional, Sequence, Tuple
3
+
4
+ from clichefactory._engine.models.normalized_doc import NormalizedDoc
5
+ from clichefactory._engine.models.document_model import BBox, Block, Heading, Image, Page
6
+ from clichefactory._engine.models.document_model import Paragraph, Section, Table, TableCell
7
+ from docling_core.types.doc.base import BoundingBox
8
+ from docling_core.types.doc.document import (
9
+ DocItem,
10
+ DoclingDocument,
11
+ PictureItem,
12
+ SectionHeaderItem,
13
+ TableItem,
14
+ NodeItem,
15
+ TextItem,
16
+ TitleItem,
17
+ ListItem,
18
+ CodeItem,
19
+ FormulaItem,
20
+ KeyValueItem
21
+ )
22
+
23
+
24
+ class DoclingNormalizedDoc(NormalizedDoc):
25
+ """
26
+ NormalizedDoc from Docling document.
27
+
28
+ output_mode: "markdown" uses docling_document.export_to_markdown();
29
+ "structured" builds markdown from pages/sections/images/tables.
30
+ """
31
+
32
+ def __init__(
33
+ self,
34
+ docling_document: DoclingDocument,
35
+ *,
36
+ output_mode: Literal["markdown", "structured"] = "markdown",
37
+ ) -> None:
38
+ self.docling_document = docling_document
39
+ self._output_mode = output_mode
40
+ self.pages: Sequence[Page] = self.build_pages()
41
+ self.sections: Sequence[Section] = self.build_sections()
42
+
43
+ self.images = tuple(
44
+ block
45
+ for page in self.pages
46
+ for block in page.blocks
47
+ if isinstance(block, Image)
48
+ )
49
+
50
+ self.tables = tuple(
51
+ block
52
+ for page in self.pages
53
+ for block in page.blocks
54
+ if isinstance(block, Table)
55
+ )
56
+
57
+ # ----- Implementation of the NormalizedDoc interface -----
58
+
59
+ def get_plain_text(self) -> str:
60
+ return self.docling_document.export_to_text()
61
+
62
+ def get_markdown(self) -> str:
63
+ if self._output_mode == "markdown":
64
+ return self.docling_document.export_to_markdown()
65
+ from clichefactory._engine.parsers.parser_utils.pdf.docling_helpers import blocks_to_markdown
66
+
67
+ return blocks_to_markdown(self.pages, self.sections, self.images, self.tables)
68
+
69
+ # ----- Docling to document_model mapping functions -----
70
+
71
+ def build_pages(self) -> Sequence[Page]:
72
+ blocks_by_page = self._collect_blocks_by_page()
73
+
74
+ # If docling provides pages, respect them
75
+ if self.docling_document.pages:
76
+ pages: list[Page] = []
77
+ for index, page_item in self.docling_document.pages.items():
78
+ size_tuple = (page_item.size.width, page_item.size.height) if page_item.size else None
79
+ page_blocks = tuple(blocks_by_page.get(index, []))
80
+ pages.append(Page(index=index, size=size_tuple, blocks=page_blocks))
81
+ return tuple(pages)
82
+
83
+ # Otherwise: single synthetic page (DOCX pagination may be unavailable)
84
+ all_blocks: list[Block] = []
85
+ for page_idx in sorted(blocks_by_page.keys()):
86
+ all_blocks.extend(blocks_by_page[page_idx])
87
+ if not all_blocks:
88
+ # last resort: collect in reading order with no page assignment
89
+ for item, _ in self.docling_document.iterate_items(with_groups=False):
90
+ b = self._to_block(item)
91
+ if b is not None:
92
+ all_blocks.append(b)
93
+
94
+ return (Page(index=1, size=None, blocks=tuple(all_blocks)),)
95
+
96
+
97
+ def build_sections(self) -> Sequence[Section]:
98
+ """
99
+ Build a semantic hierarchy from headings.
100
+
101
+ Strategy:
102
+ - Flatten blocks in reading order (page order, then block order).
103
+ - Start a new section when we hit a Heading.
104
+ - Nest based on heading level (H1 contains H2, etc.)
105
+ """
106
+ flat_blocks: list[Block] = []
107
+ # reading order: by page index then appearance
108
+ for page in sorted(self.pages, key=lambda p: p.index):
109
+ flat_blocks.extend(page.blocks)
110
+
111
+ # Collect top-level sections using a stack of (level, SectionBuilder)
112
+ class _Builder:
113
+ def __init__(self, heading: Heading) -> None:
114
+ self.heading = heading
115
+ self.blocks: list[Block] = []
116
+ self.children: list[Section] = []
117
+
118
+ def finalize(self) -> Section:
119
+ return Section(heading=self.heading, blocks=tuple(self.blocks), subsections=tuple(self.children))
120
+
121
+ stack: list[Tuple[int, _Builder]] = []
122
+ roots: list[Section] = []
123
+
124
+ def push_section(h: Heading) -> None:
125
+ nonlocal stack, roots
126
+ b = _Builder(h)
127
+ lvl = max(1, int(h.level))
128
+
129
+ # pop until parent is strictly lower level
130
+ while stack and stack[-1][0] >= lvl:
131
+ closed_lvl, closed = stack.pop()
132
+ sec = closed.finalize()
133
+ if stack:
134
+ stack[-1][1].children.append(sec)
135
+ else:
136
+ roots.append(sec)
137
+
138
+ stack.append((lvl, b))
139
+
140
+ def add_to_current(block: Block) -> None:
141
+ if not stack:
142
+ # No heading seen yet: create an implicit H1 section so content isn’t lost
143
+ push_section(Heading(level=1, text="Document"))
144
+ stack[-1][1].blocks.append(block)
145
+
146
+ for blk in flat_blocks:
147
+ if isinstance(blk, Heading):
148
+ push_section(blk)
149
+ else:
150
+ add_to_current(blk)
151
+
152
+ # close remaining
153
+ while stack:
154
+ lvl, b = stack.pop()
155
+ sec = b.finalize()
156
+ if stack:
157
+ stack[-1][1].children.append(sec)
158
+ else:
159
+ roots.append(sec)
160
+
161
+ return tuple(roots)
162
+
163
+ # Collect all blocks for all pages (poor man's provenance))
164
+ def _collect_blocks_by_page(self) -> Dict[int, list[Block]]:
165
+ blocks: DefaultDict[int, list[Block]] = defaultdict(list)
166
+ for item, _ in self.docling_document.iterate_items(with_groups=False):
167
+ block = self._to_block(item)
168
+
169
+ if block is None:
170
+ continue
171
+
172
+ if not isinstance(item, DocItem): # Could have been a GroupItem or NodeItem or similar semantic, non content descriptor
173
+ continue
174
+
175
+ page_no = self._get_primary_page(item)
176
+ if page_no is None:
177
+ continue
178
+
179
+ blocks[page_no].append(block)
180
+
181
+ return blocks
182
+
183
+
184
+ def _to_block(self, item: NodeItem) -> Optional[Block]:
185
+ if not isinstance(item, DocItem):
186
+ return None
187
+
188
+ bbox = self._item_bbox(item)
189
+
190
+ if isinstance(item, TableItem):
191
+ return self._table_to_block(item, bbox)
192
+ if isinstance(item, PictureItem):
193
+ return self._image_to_block(item, bbox)
194
+ if isinstance(item, SectionHeaderItem):
195
+ return Heading(level=item.level, text=item.text, bbox=bbox)
196
+ if isinstance(item, TitleItem):
197
+ return Heading(level=1, text=item.text, bbox=bbox)
198
+ if isinstance(item, TextItem):
199
+ return Paragraph(text=item.text, bbox=bbox)
200
+ if isinstance(item, ListItem):
201
+ return Paragraph(text=item.text, bbox=bbox)
202
+ if isinstance(item, CodeItem):
203
+ return Paragraph(text=item.text, bbox=bbox)
204
+ if isinstance(item, FormulaItem):
205
+ return Paragraph(text=item.text, bbox=bbox)
206
+ if isinstance(item, KeyValueItem):
207
+ return Paragraph(text=self._key_value_to_md(item), bbox=bbox)
208
+ # if isinstance(item, FormItem):
209
+ # Made of KeyValueItems, TextItems, etc., skipping for now
210
+
211
+ return None
212
+
213
+ def _table_to_block(self, table_item: TableItem, bbox: Optional[BBox]) -> Optional[Table]:
214
+ cells: list[TableCell] = []
215
+ page_height = self._get_page_height(table_item)
216
+
217
+ for cell in table_item.data.table_cells:
218
+ cell_bbox = self._normalize_bbox(cell.bbox, page_height)
219
+ cells.append(
220
+ TableCell(
221
+ text=cell.text,
222
+ row=cell.start_row_offset_idx,
223
+ col=cell.start_col_offset_idx,
224
+ bbox=cell_bbox,
225
+ )
226
+ )
227
+
228
+ return Table(cells=tuple(cells), bbox=bbox)
229
+
230
+ def _image_to_block(self, picture_item: PictureItem, bbox: Optional[BBox]) -> Optional[Image]:
231
+ img_ref = picture_item.image
232
+ if img_ref is None:
233
+ ref = picture_item.self_ref
234
+ mime = "application/octet-stream"
235
+ else:
236
+ ref = str(img_ref.uri)
237
+ mime = img_ref.mimetype
238
+
239
+ alt_text = picture_item.caption_text(self.docling_document) or None
240
+ return Image(ref=ref, mime_type=mime, bbox=bbox, alt_text=alt_text)
241
+
242
+ def _get_primary_page(self, item: DocItem) -> Optional[int]:
243
+ if not item.prov:
244
+ return None
245
+ return item.prov[0].page_no
246
+
247
+ def _item_bbox(self, item: DocItem) -> Optional[BBox]:
248
+ if not item.prov:
249
+ return None
250
+ return self._normalize_bbox(item.prov[0].bbox, self._get_page_height(item))
251
+
252
+ def _normalize_bbox(self, bbox: Optional[BoundingBox], page_height: Optional[float]) -> Optional[BBox]:
253
+ if bbox is None:
254
+ return None
255
+ if page_height:
256
+ return bbox.to_top_left_origin(page_height).as_tuple()
257
+ return bbox.as_tuple()
258
+
259
+ def _get_page_height(self, item: DocItem) -> Optional[float]:
260
+ page_no = self._get_primary_page(item)
261
+ if page_no is None:
262
+ return None
263
+ page = self.docling_document.pages.get(page_no)
264
+ if page is None:
265
+ return None
266
+ return page.size.height
267
+
268
+ def _key_value_to_md(self, item: KeyValueItem) -> str:
269
+ # Safely obtain the dumped mapping, then get the first key and its value without indexing dict_keys
270
+ key = ""
271
+ value = ""
272
+ dumped_kv = item.model_dump() if hasattr(item, "model_dump") else {}
273
+ key = next(iter(dumped_kv.keys()), "") or ""
274
+ value = dumped_kv.get(key, "")
275
+
276
+ key = (key or "").strip()
277
+ value = (str(value) if value is not None else "").strip()
278
+
279
+ return f"{key}: {value}"
280
+
@@ -0,0 +1,45 @@
1
+ from typing import Sequence
2
+
3
+ from clichefactory._engine.models.document_model import Image, Page, Section, Table
4
+ from clichefactory._engine.models.normalized_doc import NormalizedDoc
5
+
6
+
7
class EmlDoc(NormalizedDoc):
    """NormalizedDoc for an e-mail message whose content is precomputed.

    Every value is supplied by the caller; this class only stores it and
    returns the stored plain-text and markdown renderings.
    """

    summary_text: str
    media_type: str
    pages: Sequence[Page]
    sections: Sequence[Section]
    # Declared under the names the constructor actually assigns.
    images: Sequence[Image]
    tables: Sequence[Table]

    # Internal fields
    _plain_text: str
    _markdown: str

    def __init__(
        self,
        summary_text: str,
        media_type: str,
        pages: Sequence[Page],
        sections: Sequence[Section],
        images: Sequence[Image],
        tables: Sequence[Table],
        _plain_text: str,
        _markdown: str
    ) -> None:
        """Store the given values unchanged on the instance."""
        self.summary_text = summary_text
        self.media_type = media_type
        self.pages = pages
        self.sections = sections
        self.images = images
        self.tables = tables
        self._plain_text = _plain_text
        self._markdown = _markdown

    def get_plain_text(self) -> str:
        """Return the plain-text rendering passed to the constructor."""
        return self._plain_text

    def get_markdown(self) -> str:
        """Return the markdown rendering passed to the constructor."""
        return self._markdown