sayou-refinery 0.1.6__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sayou/refinery/__init__.py +21 -0
- sayou/refinery/core/exceptions.py +1 -1
- sayou/refinery/interfaces/base_normalizer.py +29 -8
- sayou/refinery/interfaces/base_processor.py +29 -9
- sayou/refinery/normalizer/doc_markdown_normalizer.py +107 -39
- sayou/refinery/normalizer/html_text_normalizer.py +36 -10
- sayou/refinery/normalizer/record_normalizer.py +26 -9
- sayou/refinery/pipeline.py +251 -63
- sayou/refinery/processor/deduplicator.py +14 -5
- sayou/refinery/processor/imputer.py +13 -4
- sayou/refinery/processor/outlier_handler.py +11 -4
- sayou/refinery/processor/pii_masker.py +11 -4
- sayou/refinery/processor/text_cleaner.py +13 -4
- {sayou_refinery-0.1.6.dist-info → sayou_refinery-0.3.3.dist-info}/METADATA +6 -6
- sayou_refinery-0.3.3.dist-info/RECORD +16 -0
- sayou/refinery/core/schemas.py +0 -27
- sayou_refinery-0.1.6.dist-info/RECORD +0 -16
- {sayou_refinery-0.1.6.dist-info → sayou_refinery-0.3.3.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from .pipeline import RefineryPipeline
|
|
2
|
+
from .normalizer.doc_markdown_normalizer import DocMarkdownNormalizer
|
|
3
|
+
from .normalizer.html_text_normalizer import HtmlTextNormalizer
|
|
4
|
+
from .normalizer.record_normalizer import RecordNormalizer
|
|
5
|
+
from .processor.deduplicator import Deduplicator
|
|
6
|
+
from .processor.imputer import Imputer
|
|
7
|
+
from .processor.outlier_handler import OutlierHandler
|
|
8
|
+
from .processor.pii_masker import PiiMasker
|
|
9
|
+
from .processor.text_cleaner import TextCleaner
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"RefineryPipeline",
|
|
13
|
+
"DocMarkdownNormalizer",
|
|
14
|
+
"HtmlTextNormalizer",
|
|
15
|
+
"RecordNormalizer",
|
|
16
|
+
"Deduplicator",
|
|
17
|
+
"Imputer",
|
|
18
|
+
"OutlierHandler",
|
|
19
|
+
"PiiMasker",
|
|
20
|
+
"TextCleaner",
|
|
21
|
+
]
|
|
@@ -11,7 +11,7 @@ class RefineryError(SayouCoreError):
|
|
|
11
11
|
|
|
12
12
|
class NormalizationError(RefineryError):
|
|
13
13
|
"""
|
|
14
|
-
Raised when raw data cannot be converted to
|
|
14
|
+
Raised when raw data cannot be converted to SayouBlocks.
|
|
15
15
|
(e.g., Malformed JSON, Unsupported format)
|
|
16
16
|
"""
|
|
17
17
|
|
|
@@ -3,24 +3,38 @@ from typing import Any, List
|
|
|
3
3
|
|
|
4
4
|
from sayou.core.base_component import BaseComponent
|
|
5
5
|
from sayou.core.decorators import measure_time
|
|
6
|
+
from sayou.core.schemas import SayouBlock
|
|
6
7
|
|
|
7
8
|
from ..core.exceptions import NormalizationError
|
|
8
|
-
from ..core.schemas import ContentBlock
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class BaseNormalizer(BaseComponent):
|
|
12
12
|
"""
|
|
13
|
-
(Tier 1) Abstract base class for converting raw input into
|
|
13
|
+
(Tier 1) Abstract base class for converting raw input into SayouBlock.
|
|
14
14
|
|
|
15
15
|
Normalizers are responsible for structural transformation:
|
|
16
|
-
Raw Data (JSON, HTML, DB Row) -> List[
|
|
16
|
+
Raw Data (JSON, HTML, DB Row) -> List[SayouBlock]
|
|
17
17
|
"""
|
|
18
18
|
|
|
19
19
|
component_name = "BaseNormalizer"
|
|
20
20
|
SUPPORTED_TYPES = []
|
|
21
21
|
|
|
22
|
+
@classmethod
|
|
23
|
+
def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
|
|
24
|
+
"""
|
|
25
|
+
Determines if this normalizer can handle the raw input data.
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
raw_data: The input data (dict, str, Document object, etc.)
|
|
29
|
+
strategy: Explicit type hint from user (e.g. 'html', 'json')
|
|
30
|
+
|
|
31
|
+
Returns:
|
|
32
|
+
float: Confidence score (0.0 to 1.0)
|
|
33
|
+
"""
|
|
34
|
+
return 0.0
|
|
35
|
+
|
|
22
36
|
@measure_time
|
|
23
|
-
def normalize(self, raw_data: Any) -> List[
|
|
37
|
+
def normalize(self, raw_data: Any) -> List[SayouBlock]:
|
|
24
38
|
"""
|
|
25
39
|
Execute the normalization process.
|
|
26
40
|
|
|
@@ -28,20 +42,27 @@ class BaseNormalizer(BaseComponent):
|
|
|
28
42
|
raw_data: The raw input data from Connector or Document.
|
|
29
43
|
|
|
30
44
|
Returns:
|
|
31
|
-
List[
|
|
45
|
+
List[SayouBlock]: A list of normalized content blocks.
|
|
32
46
|
|
|
33
47
|
Raises:
|
|
34
48
|
NormalizationError: If transformation fails.
|
|
35
49
|
"""
|
|
50
|
+
self._emit("on_start", input_data={"type": type(raw_data).__name__})
|
|
51
|
+
|
|
36
52
|
self._log(f"Normalizing data (Type: {type(raw_data).__name__})")
|
|
53
|
+
|
|
37
54
|
try:
|
|
38
55
|
blocks = self._do_normalize(raw_data)
|
|
56
|
+
|
|
57
|
+
self._emit("on_finish", result_data={"blocks": len(blocks)}, success=True)
|
|
58
|
+
|
|
39
59
|
if not isinstance(blocks, list):
|
|
40
60
|
raise NormalizationError(f"Output must be a list, got {type(blocks)}")
|
|
41
61
|
|
|
42
62
|
return blocks
|
|
43
63
|
|
|
44
64
|
except Exception as e:
|
|
65
|
+
self._emit("on_error", error=e)
|
|
45
66
|
wrapped_error = NormalizationError(
|
|
46
67
|
f"[{self.component_name}] Failed: {str(e)}"
|
|
47
68
|
)
|
|
@@ -49,14 +70,14 @@ class BaseNormalizer(BaseComponent):
|
|
|
49
70
|
raise wrapped_error
|
|
50
71
|
|
|
51
72
|
@abstractmethod
|
|
52
|
-
def _do_normalize(self, raw_data: Any) -> List[
|
|
73
|
+
def _do_normalize(self, raw_data: Any) -> List[SayouBlock]:
|
|
53
74
|
"""
|
|
54
|
-
[Abstract Hook] Implement logic to convert specific raw format to
|
|
75
|
+
[Abstract Hook] Implement logic to convert specific raw format to SayouBlocks.
|
|
55
76
|
|
|
56
77
|
Args:
|
|
57
78
|
raw_data: The raw input.
|
|
58
79
|
|
|
59
80
|
Returns:
|
|
60
|
-
List[
|
|
81
|
+
List[SayouBlock]: The standardized blocks.
|
|
61
82
|
"""
|
|
62
83
|
raise NotImplementedError
|
|
@@ -3,14 +3,14 @@ from typing import List
|
|
|
3
3
|
|
|
4
4
|
from sayou.core.base_component import BaseComponent
|
|
5
5
|
from sayou.core.decorators import measure_time
|
|
6
|
+
from sayou.core.schemas import SayouBlock
|
|
6
7
|
|
|
7
8
|
from ..core.exceptions import ProcessingError
|
|
8
|
-
from ..core.schemas import ContentBlock
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class BaseProcessor(BaseComponent):
|
|
12
12
|
"""
|
|
13
|
-
(Tier 1) Abstract base class for processing/cleaning
|
|
13
|
+
(Tier 1) Abstract base class for processing/cleaning SayouBlock.
|
|
14
14
|
|
|
15
15
|
Processors operate on data that is already normalized. They can modify content
|
|
16
16
|
(e.g., PII masking, Imputation) or filter out blocks (e.g., Deduplication).
|
|
@@ -18,40 +18,60 @@ class BaseProcessor(BaseComponent):
|
|
|
18
18
|
|
|
19
19
|
component_name = "BaseProcessor"
|
|
20
20
|
|
|
21
|
+
@classmethod
|
|
22
|
+
def can_handle(cls, blocks: List[SayouBlock]) -> float:
|
|
23
|
+
"""
|
|
24
|
+
Processors are usually explicitly chained, but this allows for
|
|
25
|
+
future smart-selection (e.g., auto-detecting PII).
|
|
26
|
+
"""
|
|
27
|
+
if (
|
|
28
|
+
isinstance(blocks, list)
|
|
29
|
+
and len(blocks) > 0
|
|
30
|
+
and isinstance(blocks[0], SayouBlock)
|
|
31
|
+
):
|
|
32
|
+
return 0.5
|
|
33
|
+
return 0.0
|
|
34
|
+
|
|
21
35
|
@measure_time
|
|
22
|
-
def process(self, blocks: List[
|
|
36
|
+
def process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
|
|
23
37
|
"""
|
|
24
38
|
Execute the processing logic on a list of blocks.
|
|
25
39
|
|
|
26
40
|
Args:
|
|
27
|
-
blocks: Input list of
|
|
41
|
+
blocks: Input list of SayouBlocks.
|
|
28
42
|
|
|
29
43
|
Returns:
|
|
30
|
-
List[
|
|
44
|
+
List[SayouBlock]: Processed list of SayouBlocks.
|
|
31
45
|
|
|
32
46
|
Raises:
|
|
33
47
|
ProcessingError: If processing logic fails.
|
|
34
48
|
"""
|
|
49
|
+
self._emit("on_start", input_data={"blocks": len(blocks)})
|
|
35
50
|
try:
|
|
36
51
|
if not blocks:
|
|
37
52
|
return []
|
|
38
53
|
|
|
39
|
-
|
|
54
|
+
result = self._do_process(blocks)
|
|
55
|
+
|
|
56
|
+
self._emit("on_finish", result_data={"blocks": len(result)}, success=True)
|
|
57
|
+
|
|
58
|
+
return result
|
|
40
59
|
|
|
41
60
|
except Exception as e:
|
|
61
|
+
self._emit("on_error", error=e)
|
|
42
62
|
wrapped_error = ProcessingError(f"[{self.component_name}] Failed: {str(e)}")
|
|
43
63
|
self.logger.error(wrapped_error, exc_info=True)
|
|
44
64
|
raise wrapped_error
|
|
45
65
|
|
|
46
66
|
@abstractmethod
|
|
47
|
-
def _do_process(self, blocks: List[
|
|
67
|
+
def _do_process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
|
|
48
68
|
"""
|
|
49
69
|
[Abstract Hook] Implement cleaning/filtering logic.
|
|
50
70
|
|
|
51
71
|
Args:
|
|
52
|
-
blocks: List of input
|
|
72
|
+
blocks: List of input SayouBlocks.
|
|
53
73
|
|
|
54
74
|
Returns:
|
|
55
|
-
List[
|
|
75
|
+
List[SayouBlock]: Modified list of SayouBlocks.
|
|
56
76
|
"""
|
|
57
77
|
raise NotImplementedError
|
|
@@ -1,13 +1,16 @@
|
|
|
1
1
|
from typing import Any, Dict, List
|
|
2
2
|
|
|
3
|
+
from sayou.core.registry import register_component
|
|
4
|
+
from sayou.core.schemas import SayouBlock
|
|
5
|
+
|
|
3
6
|
from ..core.exceptions import NormalizationError
|
|
4
|
-
from ..core.schemas import ContentBlock
|
|
5
7
|
from ..interfaces.base_normalizer import BaseNormalizer
|
|
6
8
|
|
|
7
9
|
|
|
10
|
+
@register_component("normalizer")
|
|
8
11
|
class DocMarkdownNormalizer(BaseNormalizer):
|
|
9
12
|
"""
|
|
10
|
-
(Tier 2) Normalizes a Sayou Document Dictionary into Markdown
|
|
13
|
+
(Tier 2) Normalizes a Sayou Document Dictionary into Markdown SayouBlocks.
|
|
11
14
|
|
|
12
15
|
This engine parses the structured dictionary output from 'sayou-document' and
|
|
13
16
|
converts individual elements (Text, Table, Image, Chart) into semantically
|
|
@@ -17,6 +20,24 @@ class DocMarkdownNormalizer(BaseNormalizer):
|
|
|
17
20
|
component_name = "DocMarkdownNormalizer"
|
|
18
21
|
SUPPORTED_TYPES = ["standard_doc", "sayou_doc_json"]
|
|
19
22
|
|
|
23
|
+
@classmethod
|
|
24
|
+
def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
|
|
25
|
+
if strategy in ["markdown", "standard_doc"]:
|
|
26
|
+
return 1.0
|
|
27
|
+
|
|
28
|
+
if hasattr(raw_data, "doc_type") and hasattr(raw_data, "pages"):
|
|
29
|
+
return 1.0
|
|
30
|
+
|
|
31
|
+
if isinstance(raw_data, str):
|
|
32
|
+
if any(
|
|
33
|
+
line.strip().startswith(("#", "-", "* "))
|
|
34
|
+
for line in raw_data.splitlines()[:10]
|
|
35
|
+
):
|
|
36
|
+
return 0.8
|
|
37
|
+
return 0.1
|
|
38
|
+
|
|
39
|
+
return 0.0
|
|
40
|
+
|
|
20
41
|
def initialize(
|
|
21
42
|
self,
|
|
22
43
|
include_headers: bool = True,
|
|
@@ -35,7 +56,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
|
|
|
35
56
|
self.include_headers = include_headers
|
|
36
57
|
self.include_footers = include_footers
|
|
37
58
|
|
|
38
|
-
def _do_normalize(self, raw_data: Any) -> List[
|
|
59
|
+
def _do_normalize(self, raw_data: Any) -> List[SayouBlock]:
|
|
39
60
|
"""
|
|
40
61
|
Execute the normalization logic on the document dictionary.
|
|
41
62
|
|
|
@@ -43,47 +64,94 @@ class DocMarkdownNormalizer(BaseNormalizer):
|
|
|
43
64
|
raw_data (Any): The input dictionary adhering to Sayou Document Schema.
|
|
44
65
|
|
|
45
66
|
Returns:
|
|
46
|
-
List[
|
|
67
|
+
List[SayouBlock]: A list of normalized content blocks (mostly 'md' type).
|
|
47
68
|
|
|
48
69
|
Raises:
|
|
49
70
|
NormalizationError: If `raw_data` is not a valid dictionary.
|
|
50
71
|
"""
|
|
51
|
-
|
|
72
|
+
# 1. Input Handling (Dict/Object/Str Safe Conversion)
|
|
73
|
+
if isinstance(raw_data, str):
|
|
74
|
+
return [SayouBlock(type="md", content=raw_data, metadata={})]
|
|
75
|
+
|
|
76
|
+
# Handle Pydantic models or objects safely
|
|
77
|
+
if hasattr(raw_data, "model_dump"):
|
|
78
|
+
doc_data = raw_data.model_dump()
|
|
79
|
+
elif hasattr(raw_data, "dict"):
|
|
80
|
+
doc_data = raw_data.dict()
|
|
81
|
+
elif hasattr(raw_data, "__dict__"):
|
|
82
|
+
doc_data = raw_data.__dict__
|
|
83
|
+
elif isinstance(raw_data, dict):
|
|
84
|
+
doc_data = raw_data
|
|
85
|
+
else:
|
|
52
86
|
raise NormalizationError(
|
|
53
|
-
f"Input must be
|
|
87
|
+
f"Input must be convertible to Dictionary, got {type(raw_data).__name__}"
|
|
54
88
|
)
|
|
55
89
|
|
|
56
|
-
|
|
57
|
-
blocks: List[ContentBlock] = []
|
|
90
|
+
normalized_blocks: List[SayouBlock] = []
|
|
58
91
|
|
|
59
|
-
|
|
60
|
-
blocks.extend(self._handle_doc_metadata(doc_data))
|
|
92
|
+
doc_meta = doc_data.get("metadata", {})
|
|
61
93
|
|
|
62
|
-
|
|
63
|
-
if
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
94
|
+
def sanitize_text(text: str) -> str:
|
|
95
|
+
if not text:
|
|
96
|
+
return ""
|
|
97
|
+
text = text.replace("\x0b", "\n")
|
|
98
|
+
text = text.replace("\r", "\n")
|
|
99
|
+
text = text.replace("\f", "\n")
|
|
100
|
+
return text
|
|
68
101
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
102
|
+
# 2. Iterate Pages
|
|
103
|
+
for page in doc_data.get("pages", []):
|
|
104
|
+
page_content_buffer = []
|
|
105
|
+
page_num = page.get("page_index", 0)
|
|
106
|
+
|
|
107
|
+
# Helper to extract text from elements using existing logic
|
|
108
|
+
def collect_text(elements, is_header=False, is_footer=False):
|
|
109
|
+
if not elements:
|
|
110
|
+
return
|
|
111
|
+
for element in elements:
|
|
112
|
+
sub_blocks = self._handle_element(element, is_header, is_footer)
|
|
113
|
+
for sb in sub_blocks:
|
|
114
|
+
if sb.content and sb.content.strip():
|
|
115
|
+
clean_content = sanitize_text(sb.content.strip())
|
|
116
|
+
page_content_buffer.append(clean_content)
|
|
117
|
+
|
|
118
|
+
# A. Header Elements
|
|
119
|
+
if self.include_headers:
|
|
120
|
+
collect_text(page.get("header_elements", []), is_header=True)
|
|
121
|
+
|
|
122
|
+
# B. Body Elements (Main Content)
|
|
123
|
+
collect_text(page.get("elements", []), is_header=False)
|
|
124
|
+
|
|
125
|
+
# C. Footer Elements
|
|
126
|
+
if self.include_footers:
|
|
127
|
+
collect_text(page.get("footer_elements", []), is_footer=True)
|
|
128
|
+
|
|
129
|
+
# 3. Aggregate: Create ONE Block per Page
|
|
130
|
+
if page_content_buffer:
|
|
131
|
+
full_page_text = "\n\n".join(page_content_buffer)
|
|
132
|
+
|
|
133
|
+
block_meta = doc_meta.copy()
|
|
134
|
+
block_meta.update(
|
|
135
|
+
{
|
|
136
|
+
"page_num": page_num,
|
|
137
|
+
"origin_type": "page_aggregated",
|
|
138
|
+
"source": doc_meta.get("filename", "unknown"),
|
|
139
|
+
}
|
|
72
140
|
)
|
|
73
141
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
self._handle_element(element, is_header=False, is_footer=True)
|
|
142
|
+
normalized_blocks.append(
|
|
143
|
+
SayouBlock(
|
|
144
|
+
type="md",
|
|
145
|
+
content=full_page_text,
|
|
146
|
+
metadata=block_meta,
|
|
80
147
|
)
|
|
148
|
+
)
|
|
81
149
|
|
|
82
|
-
return
|
|
150
|
+
return normalized_blocks
|
|
83
151
|
|
|
84
152
|
def _handle_element(
|
|
85
153
|
self, element: Dict[str, Any], is_header: bool, is_footer: bool
|
|
86
|
-
) -> List[
|
|
154
|
+
) -> List[SayouBlock]:
|
|
87
155
|
"""
|
|
88
156
|
Dispatch the element to specific handlers based on its 'type' field.
|
|
89
157
|
|
|
@@ -93,7 +161,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
|
|
|
93
161
|
is_footer (bool): True if the element is part of the page footer.
|
|
94
162
|
|
|
95
163
|
Returns:
|
|
96
|
-
List[
|
|
164
|
+
List[SayouBlock]: The resulting block(s) from the element.
|
|
97
165
|
"""
|
|
98
166
|
if is_footer and not self.include_footers:
|
|
99
167
|
return []
|
|
@@ -114,7 +182,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
|
|
|
114
182
|
|
|
115
183
|
return []
|
|
116
184
|
|
|
117
|
-
def _handle_doc_metadata(self, doc_data: Dict[str, Any]) -> List[
|
|
185
|
+
def _handle_doc_metadata(self, doc_data: Dict[str, Any]) -> List[SayouBlock]:
|
|
118
186
|
"""
|
|
119
187
|
Convert document-level metadata into a Markdown Frontmatter block.
|
|
120
188
|
|
|
@@ -122,7 +190,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
|
|
|
122
190
|
doc_data (Dict[str, Any]): The root document dictionary containing 'metadata'.
|
|
123
191
|
|
|
124
192
|
Returns:
|
|
125
|
-
List[
|
|
193
|
+
List[SayouBlock]: A single block containing YAML-like frontmatter.
|
|
126
194
|
"""
|
|
127
195
|
md_frontmatter = "---\n"
|
|
128
196
|
metadata = doc_data.get("metadata", {})
|
|
@@ -137,7 +205,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
|
|
|
137
205
|
md_frontmatter += "---\n\n"
|
|
138
206
|
|
|
139
207
|
return [
|
|
140
|
-
|
|
208
|
+
SayouBlock(
|
|
141
209
|
type="md",
|
|
142
210
|
content=md_frontmatter,
|
|
143
211
|
metadata={"page_num": 0, "id": "metadata", "is_footer": False},
|
|
@@ -146,7 +214,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
|
|
|
146
214
|
|
|
147
215
|
def _handle_text(
|
|
148
216
|
self, element: Dict[str, Any], is_header: bool, is_footer: bool
|
|
149
|
-
) -> List[
|
|
217
|
+
) -> List[SayouBlock]:
|
|
150
218
|
"""
|
|
151
219
|
Convert a text element to a Markdown block, handling headings and lists.
|
|
152
220
|
|
|
@@ -183,7 +251,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
|
|
|
183
251
|
content = text
|
|
184
252
|
|
|
185
253
|
return [
|
|
186
|
-
|
|
254
|
+
SayouBlock(
|
|
187
255
|
type="md",
|
|
188
256
|
content=content,
|
|
189
257
|
metadata={
|
|
@@ -197,7 +265,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
|
|
|
197
265
|
|
|
198
266
|
def _handle_table(
|
|
199
267
|
self, element: Dict[str, Any], is_header: bool, is_footer: bool
|
|
200
|
-
) -> List[
|
|
268
|
+
) -> List[SayouBlock]:
|
|
201
269
|
"""
|
|
202
270
|
Convert a table element into a Markdown table representation.
|
|
203
271
|
|
|
@@ -232,7 +300,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
|
|
|
232
300
|
md_table += "| " + " | ".join(body_cells) + " |\n"
|
|
233
301
|
|
|
234
302
|
return [
|
|
235
|
-
|
|
303
|
+
SayouBlock(
|
|
236
304
|
type="md",
|
|
237
305
|
content=md_table.strip(),
|
|
238
306
|
metadata={
|
|
@@ -245,7 +313,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
|
|
|
245
313
|
|
|
246
314
|
def _handle_image(
|
|
247
315
|
self, element: Dict[str, Any], is_header: bool, is_footer: bool
|
|
248
|
-
) -> List[
|
|
316
|
+
) -> List[SayouBlock]:
|
|
249
317
|
"""
|
|
250
318
|
Process an image element.
|
|
251
319
|
|
|
@@ -266,7 +334,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
|
|
|
266
334
|
img_format = element.get("image_format", "png")
|
|
267
335
|
|
|
268
336
|
return [
|
|
269
|
-
|
|
337
|
+
SayouBlock(
|
|
270
338
|
type="image_base64",
|
|
271
339
|
content=image_base64,
|
|
272
340
|
metadata={
|
|
@@ -281,7 +349,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
|
|
|
281
349
|
|
|
282
350
|
def _handle_chart(
|
|
283
351
|
self, element: Dict[str, Any], is_header: bool, is_footer: bool
|
|
284
|
-
) -> List[
|
|
352
|
+
) -> List[SayouBlock]:
|
|
285
353
|
"""
|
|
286
354
|
Convert a chart element into its text representation.
|
|
287
355
|
|
|
@@ -295,7 +363,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
|
|
|
295
363
|
content = f"--- Chart Data ---\n{text_rep}\n--------------------\n"
|
|
296
364
|
|
|
297
365
|
return [
|
|
298
|
-
|
|
366
|
+
SayouBlock(
|
|
299
367
|
type="md",
|
|
300
368
|
content=content,
|
|
301
369
|
metadata={
|
|
@@ -5,14 +5,17 @@ except ImportError:
|
|
|
5
5
|
|
|
6
6
|
from typing import Any, List
|
|
7
7
|
|
|
8
|
+
from sayou.core.registry import register_component
|
|
9
|
+
from sayou.core.schemas import SayouBlock
|
|
10
|
+
|
|
8
11
|
from ..core.exceptions import NormalizationError
|
|
9
|
-
from ..core.schemas import ContentBlock
|
|
10
12
|
from ..interfaces.base_normalizer import BaseNormalizer
|
|
11
13
|
|
|
12
14
|
|
|
15
|
+
@register_component("normalizer")
|
|
13
16
|
class HtmlTextNormalizer(BaseNormalizer):
|
|
14
17
|
"""
|
|
15
|
-
(Tier 2) Converts HTML string into a clean Text
|
|
18
|
+
(Tier 2) Converts HTML string into a clean Text SayouBlock.
|
|
16
19
|
|
|
17
20
|
Uses BeautifulSoup to strip tags, scripts, and styles, returning only
|
|
18
21
|
the visible text content while preserving paragraph structure.
|
|
@@ -21,7 +24,20 @@ class HtmlTextNormalizer(BaseNormalizer):
|
|
|
21
24
|
component_name = "HtmlTextNormalizer"
|
|
22
25
|
SUPPORTED_TYPES = ["html"]
|
|
23
26
|
|
|
24
|
-
|
|
27
|
+
@classmethod
|
|
28
|
+
def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
|
|
29
|
+
if strategy in ["html"]:
|
|
30
|
+
return 1.0
|
|
31
|
+
|
|
32
|
+
if isinstance(raw_data, str):
|
|
33
|
+
sample = raw_data[:1000].lower()
|
|
34
|
+
if "<html" in sample or "<!doctype html" in sample:
|
|
35
|
+
return 1.0
|
|
36
|
+
if "<body" in sample or "<div" in sample:
|
|
37
|
+
return 0.95
|
|
38
|
+
return 0.0
|
|
39
|
+
|
|
40
|
+
def _do_normalize(self, raw_data: Any) -> List[SayouBlock]:
|
|
25
41
|
"""
|
|
26
42
|
Parse HTML and extract text.
|
|
27
43
|
|
|
@@ -29,7 +45,7 @@ class HtmlTextNormalizer(BaseNormalizer):
|
|
|
29
45
|
raw_data (Any): The input HTML string.
|
|
30
46
|
|
|
31
47
|
Returns:
|
|
32
|
-
List[
|
|
48
|
+
List[SayouBlock]: A single block of type 'text'.
|
|
33
49
|
|
|
34
50
|
Raises:
|
|
35
51
|
ImportError: If BeautifulSoup4 is not installed.
|
|
@@ -45,15 +61,25 @@ class HtmlTextNormalizer(BaseNormalizer):
|
|
|
45
61
|
|
|
46
62
|
soup = BeautifulSoup(raw_data, "html.parser")
|
|
47
63
|
|
|
48
|
-
|
|
64
|
+
extracted_meta = {"strategy": "html_parsed"}
|
|
65
|
+
|
|
66
|
+
if soup.title and soup.title.string:
|
|
67
|
+
extracted_meta["title"] = soup.title.string.strip()
|
|
68
|
+
extracted_meta["subject"] = soup.title.string.strip()
|
|
69
|
+
|
|
70
|
+
for meta_tag in soup.find_all("meta"):
|
|
71
|
+
name = meta_tag.get("name") or meta_tag.get("property")
|
|
72
|
+
content = meta_tag.get("content")
|
|
73
|
+
if name and content:
|
|
74
|
+
extracted_meta[name] = content
|
|
75
|
+
|
|
76
|
+
for tag in soup(["script", "style", "noscript", "iframe", "head"]):
|
|
49
77
|
tag.extract()
|
|
50
78
|
|
|
51
|
-
|
|
79
|
+
text_content = soup.get_text(separator="\n")
|
|
52
80
|
|
|
53
81
|
import re
|
|
54
82
|
|
|
55
|
-
|
|
83
|
+
text_content = re.sub(r"\n{3,}", "\n\n", text_content).strip()
|
|
56
84
|
|
|
57
|
-
return [
|
|
58
|
-
ContentBlock(type="text", content=text, metadata={"source_type": "html"})
|
|
59
|
-
]
|
|
85
|
+
return [SayouBlock(type="text", content=text_content, metadata=extracted_meta)]
|
|
@@ -1,22 +1,39 @@
|
|
|
1
1
|
from typing import Any, Dict, List
|
|
2
2
|
|
|
3
|
+
from sayou.core.registry import register_component
|
|
4
|
+
from sayou.core.schemas import SayouBlock
|
|
5
|
+
|
|
3
6
|
from ..core.exceptions import NormalizationError
|
|
4
|
-
from ..core.schemas import ContentBlock
|
|
5
7
|
from ..interfaces.base_normalizer import BaseNormalizer
|
|
6
8
|
|
|
7
9
|
|
|
10
|
+
@register_component("normalizer")
|
|
8
11
|
class RecordNormalizer(BaseNormalizer):
|
|
9
12
|
"""
|
|
10
|
-
(Tier 2) Converts structured data (Dict/List) into 'record'
|
|
13
|
+
(Tier 2) Converts structured data (Dict/List) into 'record' SayouBlocks.
|
|
11
14
|
|
|
12
15
|
Suitable for processing database rows, CSV records, or JSON API responses.
|
|
13
|
-
Each dictionary becomes a separate
|
|
16
|
+
Each dictionary becomes a separate SayouBlock of type 'record'.
|
|
14
17
|
"""
|
|
15
18
|
|
|
16
19
|
component_name = "RecordNormalizer"
|
|
17
20
|
SUPPORTED_TYPES = ["json", "dict", "db_row", "record"]
|
|
18
21
|
|
|
19
|
-
|
|
22
|
+
@classmethod
|
|
23
|
+
def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
|
|
24
|
+
if strategy in ["json", "record", "db", "dict"]:
|
|
25
|
+
return 1.0
|
|
26
|
+
|
|
27
|
+
if isinstance(raw_data, dict):
|
|
28
|
+
return 0.9
|
|
29
|
+
if isinstance(raw_data, list):
|
|
30
|
+
if len(raw_data) > 0 and isinstance(raw_data[0], dict):
|
|
31
|
+
return 0.9
|
|
32
|
+
return 0.1
|
|
33
|
+
|
|
34
|
+
return 0.0
|
|
35
|
+
|
|
36
|
+
def _do_normalize(self, raw_data: Any) -> List[SayouBlock]:
|
|
20
37
|
"""
|
|
21
38
|
Convert dict or list of dicts into record blocks.
|
|
22
39
|
|
|
@@ -24,7 +41,7 @@ class RecordNormalizer(BaseNormalizer):
|
|
|
24
41
|
raw_data (Any): A Dictionary or a List of Dictionaries.
|
|
25
42
|
|
|
26
43
|
Returns:
|
|
27
|
-
List[
|
|
44
|
+
List[SayouBlock]: Blocks of type 'record'.
|
|
28
45
|
"""
|
|
29
46
|
blocks = []
|
|
30
47
|
|
|
@@ -49,17 +66,17 @@ class RecordNormalizer(BaseNormalizer):
|
|
|
49
66
|
|
|
50
67
|
return blocks
|
|
51
68
|
|
|
52
|
-
def _create_block(self, data: Dict[str, Any]) ->
|
|
69
|
+
def _create_block(self, data: Dict[str, Any]) -> SayouBlock:
|
|
53
70
|
"""
|
|
54
|
-
Helper to wrap a single dictionary into a
|
|
71
|
+
Helper to wrap a single dictionary into a SayouBlock.
|
|
55
72
|
|
|
56
73
|
Args:
|
|
57
74
|
data (Dict[str, Any]): The data record.
|
|
58
75
|
|
|
59
76
|
Returns:
|
|
60
|
-
|
|
77
|
+
SayouBlock: A block with type='record' and content=data.
|
|
61
78
|
"""
|
|
62
|
-
return
|
|
79
|
+
return SayouBlock(
|
|
63
80
|
type="record",
|
|
64
81
|
content=data,
|
|
65
82
|
metadata={"fields": list(data.keys())},
|