sayou-refinery 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sayou/refinery/core/exceptions.py +27 -0
- sayou/refinery/core/schemas.py +27 -0
- sayou/refinery/interfaces/base_normalizer.py +62 -0
- sayou/refinery/interfaces/base_processor.py +57 -0
- sayou/refinery/normalizer/doc_markdown_normalizer.py +307 -0
- sayou/refinery/normalizer/html_text_normalizer.py +59 -0
- sayou/refinery/normalizer/record_normalizer.py +66 -0
- sayou/refinery/pipeline.py +111 -0
- sayou/refinery/processor/deduplicator.py +48 -0
- sayou/refinery/processor/imputer.py +51 -0
- sayou/refinery/processor/outlier_handler.py +84 -0
- sayou/refinery/processor/pii_masker.py +52 -0
- sayou/refinery/processor/text_cleaner.py +61 -0
- sayou_refinery-0.1.6.dist-info/METADATA +310 -0
- sayou_refinery-0.1.6.dist-info/RECORD +16 -0
- sayou_refinery-0.1.6.dist-info/WHEEL +4 -0
--- /dev/null
+++ sayou/refinery/core/exceptions.py
@@ -0,0 +1,27 @@
+from sayou.core.exceptions import SayouCoreError
+
+
+class RefineryError(SayouCoreError):
+    """
+    Base exception for all errors within the sayou-refinery toolkit.
+    """
+
+    pass
+
+
+class NormalizationError(RefineryError):
+    """
+    Raised when raw data cannot be converted to ContentBlocks.
+    (e.g., Malformed JSON, Unsupported format)
+    """
+
+    pass
+
+
+class ProcessingError(RefineryError):
+    """
+    Raised when a processor fails to clean or transform blocks.
+    (e.g., PII masking failure, Imputation rule error)
+    """
+
+    pass
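Since both concrete exceptions derive from `RefineryError` (itself a `SayouCoreError`), callers can handle refinery failures at whatever granularity they need. A minimal illustrative sketch, assuming `sayou-core` is installed so the `BaseComponent` wiring works; the helper name is hypothetical:

```python
from sayou.refinery.core.exceptions import NormalizationError, RefineryError
from sayou.refinery.normalizer.record_normalizer import RecordNormalizer

def normalize_or_skip(raw_data):
    """Hypothetical helper: drop inputs that cannot be normalized."""
    try:
        return RecordNormalizer().normalize(raw_data)
    except NormalizationError:
        return []   # e.g., the input was a plain string, not Dict/List[Dict]
    except RefineryError:
        raise       # any other refinery failure: surface it to the caller
```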
--- /dev/null
+++ sayou/refinery/core/schemas.py
@@ -0,0 +1,27 @@
+from typing import Any, Dict, List, Union
+
+from pydantic import BaseModel, Field
+
+
+class ContentBlock(BaseModel):
+    """
+    Standard unit of content refined from raw data.
+
+    Refinery normalizes raw inputs into a list of these blocks.
+    Processors iterate over these blocks to clean or modify them.
+    """
+
+    type: str = Field(
+        ..., description="Block type (e.g., 'text', 'md', 'record', 'table')"
+    )
+
+    content: Union[str, Dict[str, Any], List[Any]] = Field(
+        ..., description="The actual data payload"
+    )
+
+    metadata: Dict[str, Any] = Field(
+        default_factory=dict, description="Context info (page_num, source_id, etc.)"
+    )
+
+    class Config:
+        arbitrary_types_allowed = True
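`ContentBlock` is a plain Pydantic model, so blocks can be constructed and serialized directly. A quick sketch (the `class Config` style above is Pydantic v1; on v2, prefer `model_dump()` over `dict()`):

```python
from sayou.refinery.core.schemas import ContentBlock

block = ContentBlock(
    type="record",
    content={"id": 7, "name": "Ada"},
    metadata={"source_id": "users.csv"},
)
print(block.type)      # record
print(block.metadata)  # {'source_id': 'users.csv'}
print(block.dict())    # full payload as a plain dict
```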
--- /dev/null
+++ sayou/refinery/interfaces/base_normalizer.py
@@ -0,0 +1,62 @@
+from abc import abstractmethod
+from typing import Any, List
+
+from sayou.core.base_component import BaseComponent
+from sayou.core.decorators import measure_time
+
+from ..core.exceptions import NormalizationError
+from ..core.schemas import ContentBlock
+
+
+class BaseNormalizer(BaseComponent):
+    """
+    (Tier 1) Abstract base class for converting raw input into ContentBlocks.
+
+    Normalizers are responsible for structural transformation:
+    Raw Data (JSON, HTML, DB Row) -> List[ContentBlock]
+    """
+
+    component_name = "BaseNormalizer"
+    SUPPORTED_TYPES = []
+
+    @measure_time
+    def normalize(self, raw_data: Any) -> List[ContentBlock]:
+        """
+        Execute the normalization process.
+
+        Args:
+            raw_data: The raw input data from Connector or Document.
+
+        Returns:
+            List[ContentBlock]: A list of normalized content blocks.
+
+        Raises:
+            NormalizationError: If transformation fails.
+        """
+        self._log(f"Normalizing data (Type: {type(raw_data).__name__})")
+        try:
+            blocks = self._do_normalize(raw_data)
+            if not isinstance(blocks, list):
+                raise NormalizationError(f"Output must be a list, got {type(blocks)}")
+
+            return blocks
+
+        except Exception as e:
+            wrapped_error = NormalizationError(
+                f"[{self.component_name}] Failed: {str(e)}"
+            )
+            self.logger.error(wrapped_error, exc_info=True)
+            raise wrapped_error
+
+    @abstractmethod
+    def _do_normalize(self, raw_data: Any) -> List[ContentBlock]:
+        """
+        [Abstract Hook] Implement logic to convert a specific raw format into ContentBlocks.
+
+        Args:
+            raw_data: The raw input.
+
+        Returns:
+            List[ContentBlock]: The standardized blocks.
+        """
+        raise NotImplementedError
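New source formats plug in by subclassing `BaseNormalizer` and implementing the `_do_normalize` hook; `normalize()` supplies logging and error wrapping. A hypothetical CSV-row normalizer as a sketch (assumes `sayou-core` provides the `BaseComponent` plumbing used above):

```python
from typing import Any, List

from sayou.refinery.core.schemas import ContentBlock
from sayou.refinery.interfaces.base_normalizer import BaseNormalizer


class CsvRowNormalizer(BaseNormalizer):
    """Hypothetical Tier 2 normalizer: list-of-lists -> 'record' blocks."""

    component_name = "CsvRowNormalizer"
    SUPPORTED_TYPES = ["csv_rows"]

    def _do_normalize(self, raw_data: Any) -> List[ContentBlock]:
        header, *rows = raw_data  # first row holds the column names
        return [
            ContentBlock(type="record", content=dict(zip(header, row)), metadata={})
            for row in rows
        ]
```

Such a class would be registered through `RefineryPipeline(extra_normalizers=[CsvRowNormalizer()])`, as the pipeline code later in this diff shows.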
--- /dev/null
+++ sayou/refinery/interfaces/base_processor.py
@@ -0,0 +1,57 @@
+from abc import abstractmethod
+from typing import List
+
+from sayou.core.base_component import BaseComponent
+from sayou.core.decorators import measure_time
+
+from ..core.exceptions import ProcessingError
+from ..core.schemas import ContentBlock
+
+
+class BaseProcessor(BaseComponent):
+    """
+    (Tier 1) Abstract base class for processing/cleaning ContentBlocks.
+
+    Processors operate on data that is already normalized. They can modify content
+    (e.g., PII masking, Imputation) or filter out blocks (e.g., Deduplication).
+    """
+
+    component_name = "BaseProcessor"
+
+    @measure_time
+    def process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
+        """
+        Execute the processing logic on a list of blocks.
+
+        Args:
+            blocks: Input list of ContentBlocks.
+
+        Returns:
+            List[ContentBlock]: Processed list of ContentBlocks.
+
+        Raises:
+            ProcessingError: If processing logic fails.
+        """
+        try:
+            if not blocks:
+                return []
+
+            return self._do_process(blocks)
+
+        except Exception as e:
+            wrapped_error = ProcessingError(f"[{self.component_name}] Failed: {str(e)}")
+            self.logger.error(wrapped_error, exc_info=True)
+            raise wrapped_error
+
+    @abstractmethod
+    def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
+        """
+        [Abstract Hook] Implement cleaning/filtering logic.
+
+        Args:
+            blocks: List of input ContentBlocks.
+
+        Returns:
+            List[ContentBlock]: Modified list of ContentBlocks.
+        """
+        raise NotImplementedError
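Custom cleaning stages follow the same hook pattern: implement `_do_process` and let `process()` handle empty input and error wrapping. A hypothetical minimum-length filter, as a sketch:

```python
from typing import List

from sayou.refinery.core.schemas import ContentBlock
from sayou.refinery.interfaces.base_processor import BaseProcessor


class ShortBlockFilter(BaseProcessor):
    """Hypothetical processor: drop string blocks shorter than min_chars."""

    component_name = "ShortBlockFilter"

    def initialize(self, min_chars: int = 20, **kwargs):
        self.min_chars = min_chars

    def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
        return [
            b for b in blocks
            if not (isinstance(b.content, str) and len(b.content) < self.min_chars)
        ]
```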
--- /dev/null
+++ sayou/refinery/normalizer/doc_markdown_normalizer.py
@@ -0,0 +1,307 @@
+from typing import Any, Dict, List
+
+from ..core.exceptions import NormalizationError
+from ..core.schemas import ContentBlock
+from ..interfaces.base_normalizer import BaseNormalizer
+
+
+class DocMarkdownNormalizer(BaseNormalizer):
+    """
+    (Tier 2) Normalizes a Sayou Document Dictionary into Markdown ContentBlocks.
+
+    This engine parses the structured dictionary output from 'sayou-document' and
+    converts individual elements (Text, Table, Image, Chart) into semantically
+    rich Markdown blocks. It also handles metadata conversion to Frontmatter.
+    """
+
+    component_name = "DocMarkdownNormalizer"
+    SUPPORTED_TYPES = ["standard_doc", "sayou_doc_json"]
+
+    def initialize(
+        self,
+        include_headers: bool = True,
+        include_footers: bool = False,
+        **kwargs,
+    ):
+        """
+        Configure the normalizer's behavior regarding document structure.
+
+        Args:
+            include_headers (bool): If True, processes elements found in page headers.
+            include_footers (bool): If True, processes elements found in page footers.
+            **kwargs: Additional configuration parameters passed to parent.
+        """
+        super().initialize(**kwargs)
+        self.include_headers = include_headers
+        self.include_footers = include_footers
+
+    def _do_normalize(self, raw_data: Any) -> List[ContentBlock]:
+        """
+        Execute the normalization logic on the document dictionary.
+
+        Args:
+            raw_data (Any): The input dictionary adhering to Sayou Document Schema.
+
+        Returns:
+            List[ContentBlock]: A list of normalized content blocks (mostly 'md' type).
+
+        Raises:
+            NormalizationError: If `raw_data` is not a valid dictionary.
+        """
+        if not isinstance(raw_data, dict):
+            raise NormalizationError(
+                f"Input must be a Dictionary, got {type(raw_data).__name__}"
+            )
+
+        doc_data = raw_data
+        blocks: List[ContentBlock] = []
+
+        if "metadata" in doc_data and doc_data["metadata"]:
+            blocks.extend(self._handle_doc_metadata(doc_data))
+
+        for page in doc_data.get("pages", []):
+            if self.include_headers and "header_elements" in page:
+                for element in page.get("header_elements", []):
+                    blocks.extend(
+                        self._handle_element(element, is_header=True, is_footer=False)
+                    )
+
+            for element in page.get("elements", []):
+                blocks.extend(
+                    self._handle_element(element, is_header=False, is_footer=False)
+                )
+
+            if self.include_footers and "footer_elements" in page:
+                for element in page.get("footer_elements", []):
+                    # Tier 2 default rule: even when include_footers is True,
+                    # _handle_element may skip is_footer=True elements (Tier 3 can override).
+                    blocks.extend(
+                        self._handle_element(element, is_header=False, is_footer=True)
+                    )
+
+        return blocks
+
+    def _handle_element(
+        self, element: Dict[str, Any], is_header: bool, is_footer: bool
+    ) -> List[ContentBlock]:
+        """
+        Dispatch the element to specific handlers based on its 'type' field.
+
+        Args:
+            element (Dict[str, Any]): The element dictionary.
+            is_header (bool): True if the element is part of the page header.
+            is_footer (bool): True if the element is part of the page footer.
+
+        Returns:
+            List[ContentBlock]: The resulting block(s) from the element.
+        """
+        if is_footer and not self.include_footers:
+            return []
+
+        elem_type = element.get("type")
+
+        if elem_type == "text":
+            return self._handle_text(element, is_header, is_footer)
+
+        if elem_type == "table":
+            return self._handle_table(element, is_header, is_footer)
+
+        if elem_type == "image":
+            return self._handle_image(element, is_header, is_footer)
+
+        if elem_type == "chart":
+            return self._handle_chart(element, is_header, is_footer)
+
+        return []
+
+    def _handle_doc_metadata(self, doc_data: Dict[str, Any]) -> List[ContentBlock]:
+        """
+        Convert document-level metadata into a Markdown Frontmatter block.
+
+        Args:
+            doc_data (Dict[str, Any]): The root document dictionary containing 'metadata'.
+
+        Returns:
+            List[ContentBlock]: A single block containing YAML-like frontmatter.
+        """
+        md_frontmatter = "---\n"
+        metadata = doc_data.get("metadata", {})
+
+        title = metadata.get("title")
+        author = metadata.get("author")
+
+        if title:
+            md_frontmatter += f"title: {title}\n"
+        if author:
+            md_frontmatter += f"author: {author}\n"
+        md_frontmatter += "---\n\n"
+
+        return [
+            ContentBlock(
+                type="md",
+                content=md_frontmatter,
+                metadata={"page_num": 0, "id": "metadata", "is_footer": False},
+            )
+        ]
+
+    def _handle_text(
+        self, element: Dict[str, Any], is_header: bool, is_footer: bool
+    ) -> List[ContentBlock]:
+        """
+        Convert a text element to a Markdown block, handling headings and lists.
+
+        Uses 'semantic_type' (heading/list) and 'level' attributes to generate
+        appropriate Markdown syntax (e.g., '# Title', '- Item').
+        """
+        text = element.get("text", "").strip()
+        if not text:
+            return []
+
+        raw_attrs = element.get("raw_attributes", {})
+        semantic_type = raw_attrs.get("semantic_type")
+
+        content = None
+
+        # 1. Handle '●' bullet items (lists)
+        if semantic_type == "list":
+            level = raw_attrs.get("list_level", 0)
+            indent = " " * level
+            content = f"{indent}- {text}"
+
+        # 2. Handle 'Heading 1-9' (titles)
+        elif semantic_type == "heading":
+            level = raw_attrs.get("heading_level", 1)
+            hashes = "#" * level
+            content = f"{hashes} {text}"
+
+        # 3. PPT placeholders (legacy compatibility)
+        elif raw_attrs.get("placeholder_type") == "TITLE":
+            content = f"# {text}"
+
+        # 4. Everything else (plain text)
+        else:
+            content = text
+
+        return [
+            ContentBlock(
+                type="md",
+                content=content,
+                metadata={
+                    "page_num": element.get("meta", {}).get("page_num"),
+                    "id": element.get("id"),
+                    "style": raw_attrs.get("style"),
+                    "is_footer": is_footer,
+                },
+            )
+        ]
+
+    def _handle_table(
+        self, element: Dict[str, Any], is_header: bool, is_footer: bool
+    ) -> List[ContentBlock]:
+        """
+        Convert a table element into a Markdown table representation.
+
+        Args:
+            element (Dict[str, Any]): Must contain 'data' (2D list).
+        """
+        md_table = ""
+        table_data = element.get("data", [])
+
+        if not table_data:
+            return []
+
+        max_cols = 0
+        for row in table_data:
+            if row:
+                max_cols = max(max_cols, len(row))
+
+        if max_cols == 0:
+            return []
+
+        # 1. Header row (the first row)
+        header = table_data[0]
+        header_cells = list(map(str, header)) + [""] * (max_cols - len(header))
+        md_table += "| " + " | ".join(header_cells) + " |\n"
+
+        # 2. Separator row (sized to the max column count)
+        md_table += "| " + " | ".join(["---"] * max_cols) + " |\n"
+
+        # 3. Body rows (from the second row on)
+        for row in table_data[1:]:
+            body_cells = list(map(str, row)) + [""] * (max_cols - len(row))
+            md_table += "| " + " | ".join(body_cells) + " |\n"
+
+        return [
+            ContentBlock(
+                type="md",
+                content=md_table.strip(),
+                metadata={
+                    "page_num": element.get("meta", {}).get("page_num"),
+                    "id": element.get("id"),
+                    "is_footer": is_footer,
+                },
+            )
+        ]
+
+    def _handle_image(
+        self, element: Dict[str, Any], is_header: bool, is_footer: bool
+    ) -> List[ContentBlock]:
+        """
+        Process an image element.
+
+        Depending on implementation, this might return an 'image_base64' block
+        or a Markdown image link if an external URL is provided.
+        """
+        image_base64 = element.get("image_base64")
+        if not image_base64:
+            return []
+
+        ocr_text = (element.get("ocr_text") or "").strip()
+        if not ocr_text:
+            alt_text = "image"
+        else:
+            alt_text = ocr_text
+
+        img_format = element.get("image_format", "png")
+
+        return [
+            ContentBlock(
+                type="image_base64",
+                content=image_base64,
+                metadata={
+                    "page_num": element.get("meta", {}).get("page_num"),
+                    "id": element.get("id"),
+                    "is_footer": is_footer,
+                    "alt_text": alt_text,
+                    "format": img_format,
+                },
+            )
+        ]
+
+    def _handle_chart(
+        self, element: Dict[str, Any], is_header: bool, is_footer: bool
+    ) -> List[ContentBlock]:
+        """
+        Convert a chart element into its text representation.
+
+        Uses the 'text_representation' field from the element to create
+        a descriptive text block for LLM consumption.
+        """
+        text_rep = element.get("text_representation")
+        if not text_rep:
+            return []
+
+        content = f"--- Chart Data ---\n{text_rep}\n--------------------\n"
+
+        return [
+            ContentBlock(
+                type="md",
+                content=content,
+                metadata={
+                    "page_num": element.get("meta", {}).get("page_num"),
+                    "id": element.get("id"),
+                    "is_footer": is_footer,
+                },
+            )
+        ]
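To see the element dispatch end to end, here is an illustrative sketch feeding a tiny hand-built document dict (field names taken from the code above) straight into the normalizer; it assumes `sayou-core` is installed:

```python
from sayou.refinery.normalizer.doc_markdown_normalizer import DocMarkdownNormalizer

doc = {
    "metadata": {"title": "Q3 Report", "author": "Ops"},
    "pages": [{
        "elements": [
            {"type": "text", "text": "Summary",
             "raw_attributes": {"semantic_type": "heading", "heading_level": 2}},
            {"type": "table", "data": [["kpi", "value"], ["uptime", "99.9"]]},
        ],
    }],
}

normalizer = DocMarkdownNormalizer()
normalizer.initialize()
for block in normalizer.normalize(doc):
    print(block.content)
# Roughly: a '---' frontmatter block, then '## Summary', then a 2x2 Markdown table.
```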
--- /dev/null
+++ sayou/refinery/normalizer/html_text_normalizer.py
@@ -0,0 +1,59 @@
+try:
+    from bs4 import BeautifulSoup
+except ImportError:
+    BeautifulSoup = None
+
+from typing import Any, List
+
+from ..core.exceptions import NormalizationError
+from ..core.schemas import ContentBlock
+from ..interfaces.base_normalizer import BaseNormalizer
+
+
+class HtmlTextNormalizer(BaseNormalizer):
+    """
+    (Tier 2) Converts HTML string into a clean Text ContentBlock.
+
+    Uses BeautifulSoup to strip tags, scripts, and styles, returning only
+    the visible text content while preserving paragraph structure.
+    """
+
+    component_name = "HtmlTextNormalizer"
+    SUPPORTED_TYPES = ["html"]
+
+    def _do_normalize(self, raw_data: Any) -> List[ContentBlock]:
+        """
+        Parse HTML and extract text.
+
+        Args:
+            raw_data (Any): The input HTML string.
+
+        Returns:
+            List[ContentBlock]: A single block of type 'text'.
+
+        Raises:
+            ImportError: If BeautifulSoup4 is not installed.
+            NormalizationError: If input is not a string.
+        """
+        if not BeautifulSoup:
+            raise ImportError("BeautifulSoup4 is required for HtmlTextNormalizer.")
+
+        if not isinstance(raw_data, str):
+            raise NormalizationError(
+                f"Input must be HTML string, got {type(raw_data)}."
+            )
+
+        soup = BeautifulSoup(raw_data, "html.parser")
+
+        for tag in soup(["script", "style", "noscript", "iframe"]):
+            tag.extract()
+
+        text = soup.get_text(separator="\n")
+
+        import re
+
+        text = re.sub(r"\n{3,}", "\n\n", text).strip()
+
+        return [
+            ContentBlock(type="text", content=text, metadata={"source_type": "html"})
+        ]
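A quick usage sketch (requires `beautifulsoup4`, per the guarded import above):

```python
from sayou.refinery.normalizer.html_text_normalizer import HtmlTextNormalizer

html = """
<html><head><style>p { color: red; }</style></head>
<body><h1>Hello</h1><script>alert(1)</script><p>Visible text.</p></body></html>
"""

blocks = HtmlTextNormalizer().normalize(html)
print(blocks[0].content)
# Roughly: 'Hello' and 'Visible text.' on separate lines; script/style content is gone.
```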
--- /dev/null
+++ sayou/refinery/normalizer/record_normalizer.py
@@ -0,0 +1,66 @@
+from typing import Any, Dict, List
+
+from ..core.exceptions import NormalizationError
+from ..core.schemas import ContentBlock
+from ..interfaces.base_normalizer import BaseNormalizer
+
+
+class RecordNormalizer(BaseNormalizer):
+    """
+    (Tier 2) Converts structured data (Dict/List) into 'record' ContentBlocks.
+
+    Suitable for processing database rows, CSV records, or JSON API responses.
+    Each dictionary becomes a separate ContentBlock of type 'record'.
+    """
+
+    component_name = "RecordNormalizer"
+    SUPPORTED_TYPES = ["json", "dict", "db_row", "record"]
+
+    def _do_normalize(self, raw_data: Any) -> List[ContentBlock]:
+        """
+        Convert dict or list of dicts into record blocks.
+
+        Args:
+            raw_data (Any): A Dictionary or a List of Dictionaries.
+
+        Returns:
+            List[ContentBlock]: Blocks of type 'record'.
+        """
+        blocks = []
+
+        # Case 1: Single Dictionary
+        if isinstance(raw_data, dict):
+            blocks.append(self._create_block(raw_data))
+
+        # Case 2: List of Dictionaries
+        elif isinstance(raw_data, list):
+            for item in raw_data:
+                if isinstance(item, dict):
+                    blocks.append(self._create_block(item))
+                else:
+                    self._log(
+                        f"Skipping non-dict item in list: {type(item)}", level="warning"
+                    )
+
+        else:
+            raise NormalizationError(
+                f"Input must be Dict or List[Dict], got {type(raw_data)}"
+            )
+
+        return blocks
+
+    def _create_block(self, data: Dict[str, Any]) -> ContentBlock:
+        """
+        Helper to wrap a single dictionary into a ContentBlock.
+
+        Args:
+            data (Dict[str, Any]): The data record.
+
+        Returns:
+            ContentBlock: A block with type='record' and content=data.
+        """
+        return ContentBlock(
+            type="record",
+            content=data,
+            metadata={"fields": list(data.keys())},
+        )
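Usage sketch: a list of dicts becomes one 'record' block per row, with the field names captured in metadata:

```python
from sayou.refinery.normalizer.record_normalizer import RecordNormalizer

rows = [
    {"id": 1, "name": "Ada", "email": "ada@example.com"},
    {"id": 2, "name": "Grace", "email": None},
]

blocks = RecordNormalizer().normalize(rows)
print(len(blocks))                   # 2
print(blocks[0].metadata["fields"])  # ['id', 'name', 'email']
```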
--- /dev/null
+++ sayou/refinery/pipeline.py
@@ -0,0 +1,111 @@
+from typing import Any, Dict, List, Optional
+
+from sayou.core.base_component import BaseComponent
+from sayou.core.decorators import safe_run
+
+from .core.exceptions import RefineryError
+from .core.schemas import ContentBlock
+from .interfaces.base_normalizer import BaseNormalizer
+from .interfaces.base_processor import BaseProcessor
+from .normalizer.doc_markdown_normalizer import DocMarkdownNormalizer
+from .normalizer.html_text_normalizer import HtmlTextNormalizer
+from .normalizer.record_normalizer import RecordNormalizer
+from .processor.deduplicator import Deduplicator
+from .processor.imputer import Imputer
+from .processor.outlier_handler import OutlierHandler
+from .processor.pii_masker import PiiMasker
+from .processor.text_cleaner import TextCleaner
+
+
+class RefineryPipeline(BaseComponent):
+    """
+    Orchestrates the data refinement process.
+    1. Selects a Normalizer to convert raw data into standard ContentBlocks.
+    2. Runs a chain of Processors to clean and transform the blocks.
+    """
+
+    component_name = "RefineryPipeline"
+
+    def __init__(
+        self,
+        extra_normalizers: Optional[List[BaseNormalizer]] = None,
+        processors: Optional[List[BaseProcessor]] = None,
+    ):
+        super().__init__()
+        self.normalizers: Dict[str, BaseNormalizer] = {}
+
+        # 1. Register Default Normalizers
+        defaults = [DocMarkdownNormalizer(), HtmlTextNormalizer(), RecordNormalizer()]
+        self._register(defaults)
+
+        # 2. Register User Extras
+        if extra_normalizers:
+            self._register(extra_normalizers)
+
+        # 3. Setup Processors Chain
+        self.processors = (
+            processors
+            if processors is not None
+            else [
+                TextCleaner(),
+                PiiMasker(),
+                Deduplicator(),
+                Imputer(),
+                OutlierHandler(),
+            ]
+        )
+
+    def _register(self, comps: List[BaseNormalizer]):
+        for c in comps:
+            for t in getattr(c, "SUPPORTED_TYPES", []):
+                self.normalizers[t] = c
+
+    @safe_run(default_return=None)
+    def initialize(self, **kwargs):
+        """
+        Initialize all sub-components (Normalizers and Processors).
+        Passes global configuration (like PII masking rules) down to components.
+        """
+        for norm in set(self.normalizers.values()):
+            norm.initialize(**kwargs)
+
+        for proc in self.processors:
+            proc.initialize(**kwargs)
+
+        self._log(
+            f"Refinery initialized with {len(self.processors)} processors in chain."
+        )
+
+    def run(
+        self, raw_data: Any, source_type: str = "standard_doc"
+    ) -> List[ContentBlock]:
+        """
+        Execute the refinement pipeline.
+
+        Args:
+            raw_data: The raw input data (dict, html string, db row list, etc.)
+            source_type: The type of input data (e.g., 'standard_doc', 'html', 'json')
+
+        Returns:
+            List[ContentBlock]: A list of clean, normalized blocks.
+        """
+        # Step 1: Normalize (Structure Transformation)
+        normalizer = self.normalizers.get(source_type)
+        if not normalizer:
+            supported = list(self.normalizers.keys())
+            raise RefineryError(
+                f"Unknown source_type '{source_type}'. Supported: {supported}"
+            )
+
+        try:
+            blocks = normalizer.normalize(raw_data)
+        except Exception as e:
+            self.logger.error(f"Normalization step failed: {e}")
+            return []
+
+        # Step 2: Process (Content Cleaning)
+        # Processors modify blocks in-place or return new lists
+        for processor in self.processors:
+            blocks = processor.process(blocks)
+
+        return blocks
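The constructor accepts a custom processor chain, so stages can be reordered or dropped. A sketch that runs only PII masking and deduplication over record input (assuming, as the Tier 2 components in this diff imply, that `BaseComponent.initialize` tolerates extra kwargs):

```python
from sayou.refinery.pipeline import RefineryPipeline
from sayou.refinery.processor.deduplicator import Deduplicator
from sayou.refinery.processor.pii_masker import PiiMasker

pipeline = RefineryPipeline(processors=[PiiMasker(), Deduplicator()])
pipeline.initialize(mask_phone=False)  # kwargs fan out to every registered component

blocks = pipeline.run(
    [{"user": "ada"}, {"user": "ada"}],  # the second record is a duplicate
    source_type="json",
)
print(len(blocks))  # 1 after deduplication
```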
--- /dev/null
+++ sayou/refinery/processor/deduplicator.py
@@ -0,0 +1,48 @@
+import json
+from typing import List, Set
+
+from ..core.schemas import ContentBlock
+from ..interfaces.base_processor import BaseProcessor
+
+
+class Deduplicator(BaseProcessor):
+    """
+    (Tier 2) Removes duplicate blocks based on content hashing.
+
+    It computes a hash of the content for each block and filters out
+    subsequent blocks that match an already seen hash.
+    """
+
+    component_name = "Deduplicator"
+
+    def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
+        """
+        Iterate through blocks and remove duplicates.
+
+        Args:
+            blocks (List[ContentBlock]): The input list of blocks.
+
+        Returns:
+            List[ContentBlock]: A new list with duplicates removed.
+        """
+        seen_hashes: Set[int] = set()
+        unique_blocks: List[ContentBlock] = []
+
+        for block in blocks:
+            # Generate a hash key (stable within this run)
+            if isinstance(block.content, dict):
+                content_str = json.dumps(block.content, sort_keys=True)
+            else:
+                content_str = str(block.content)
+
+            if len(content_str) < 5:
+                unique_blocks.append(block)
+                continue
+
+            content_hash = hash(content_str)
+
+            if content_hash not in seen_hashes:
+                seen_hashes.add(content_hash)
+                unique_blocks.append(block)
+
+        return unique_blocks
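One caveat worth knowing: Python's built-in `hash()` on strings is randomized per interpreter process (see PYTHONHASHSEED), which is fine for the in-run deduplication above but not for persisting hashes across runs. A sketch contrasting the two approaches:

```python
import hashlib
import json

record = {"user": "ada", "id": 1}
content_str = json.dumps(record, sort_keys=True)

in_run_key = hash(content_str)  # stable within one process, varies across runs
stable_key = hashlib.sha256(content_str.encode("utf-8")).hexdigest()  # stable everywhere
```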
--- /dev/null
+++ sayou/refinery/processor/imputer.py
@@ -0,0 +1,51 @@
+from typing import Any, Dict, List
+
+from ..core.schemas import ContentBlock
+from ..interfaces.base_processor import BaseProcessor
+
+
+class Imputer(BaseProcessor):
+    """
+    (Tier 2) Fills missing values in 'record' type blocks using defined rules.
+
+    Only operates on blocks with type='record' where the content is a dictionary.
+    """
+
+    component_name = "Imputer"
+
+    def initialize(self, imputation_rules: Dict[str, Any] = None, **kwargs):
+        """
+        Set imputation rules.
+
+        Args:
+            imputation_rules (Dict[str, Any]): Mapping of field names to default values.
+                Example: {"category": "Unknown", "price": 0.0}
+            **kwargs: Additional arguments.
+        """
+        self.rules = imputation_rules or {}
+        if not self.rules:
+            self._log("Imputer initialized with no rules.", level="warning")
+
+    def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
+        """
+        Apply imputation rules to record blocks.
+
+        Args:
+            blocks (List[ContentBlock]): Input blocks.
+
+        Returns:
+            List[ContentBlock]: Blocks with missing values filled.
+        """
+        for block in blocks:
+            if block.type != "record" or not isinstance(block.content, dict):
+                continue
+
+            record = block.content
+
+            for field, default_value in self.rules.items():
+                if record.get(field) is None:
+                    record[field] = default_value
+
+            block.content = record
+
+        return blocks
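A sketch of the imputer in isolation; note that `record.get(field) is None` covers both a missing key and an explicit `None`:

```python
from sayou.refinery.core.schemas import ContentBlock
from sayou.refinery.processor.imputer import Imputer

imputer = Imputer()
imputer.initialize(imputation_rules={"category": "Unknown", "price": 0.0})

blocks = [ContentBlock(type="record", content={"sku": "A1", "price": None})]
out = imputer.process(blocks)
print(out[0].content)
# {'sku': 'A1', 'price': 0.0, 'category': 'Unknown'}
```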
--- /dev/null
+++ sayou/refinery/processor/outlier_handler.py
@@ -0,0 +1,84 @@
+from typing import Any, Dict, List
+
+from ..core.schemas import ContentBlock
+from ..interfaces.base_processor import BaseProcessor
+
+
+class OutlierHandler(BaseProcessor):
+    """
+    (Tier 2) Handles numerical outliers in 'record' blocks.
+
+    Can either 'drop' the entire block or 'clamp' the value to a boundary
+    if a field violates the defined min/max rules.
+    """
+
+    component_name = "OutlierHandler"
+
+    def initialize(self, outlier_rules: Dict[str, Dict[str, Any]] = None, **kwargs):
+        """
+        Set outlier handling rules.
+
+        Args:
+            outlier_rules (Dict[str, Dict[str, Any]]): Mapping of field names to constraints.
+                Example:
+                    {
+                        "age": {"min": 0, "max": 120, "action": "drop"},
+                        "score": {"min": 0, "max": 100, "action": "clamp"}
+                    }
+            **kwargs: Additional arguments.
+        """
+        self.rules = outlier_rules or {}
+
+    def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
+        """
+        Check numerical fields against rules and filter/modify blocks.
+
+        Args:
+            blocks (List[ContentBlock]): Input blocks.
+
+        Returns:
+            List[ContentBlock]: Filtered or modified list of blocks.
+        """
+        valid_blocks = []
+
+        for block in blocks:
+            if block.type != "record" or not isinstance(block.content, dict):
+                valid_blocks.append(block)
+                continue
+
+            record = block.content
+            should_drop = False
+
+            for field, rule in self.rules.items():
+                val = record.get(field)
+                if val is None:
+                    continue
+
+                try:
+                    val_f = float(val)
+                    min_v = rule.get("min")
+                    max_v = rule.get("max")
+                    action = rule.get("action", "drop")
+
+                    if min_v is not None and val_f < min_v:
+                        if action == "drop":
+                            should_drop = True
+                            break
+                        elif action == "clamp":
+                            record[field] = min_v
+
+                    if max_v is not None and val_f > max_v:
+                        if action == "drop":
+                            should_drop = True
+                            break
+                        elif action == "clamp":
+                            record[field] = max_v
+
+                except (ValueError, TypeError):
+                    continue
+
+            if not should_drop:
+                block.content = record
+                valid_blocks.append(block)
+
+        return valid_blocks
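A sketch showing both actions: 'clamp' rewrites the offending value, while 'drop' removes the whole record block:

```python
from sayou.refinery.core.schemas import ContentBlock
from sayou.refinery.processor.outlier_handler import OutlierHandler

handler = OutlierHandler()
handler.initialize(outlier_rules={
    "age": {"min": 0, "max": 120, "action": "drop"},
    "score": {"min": 0, "max": 100, "action": "clamp"},
})

blocks = [
    ContentBlock(type="record", content={"age": 130, "score": 50}),  # dropped
    ContentBlock(type="record", content={"age": 30, "score": 140}),  # score clamped
]
out = handler.process(blocks)
print([b.content for b in out])
# [{'age': 30, 'score': 100}]
```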
--- /dev/null
+++ sayou/refinery/processor/pii_masker.py
@@ -0,0 +1,52 @@
+import re
+from typing import List
+
+from ..core.schemas import ContentBlock
+from ..interfaces.base_processor import BaseProcessor
+
+
+class PiiMasker(BaseProcessor):
+    """
+    (Tier 2) Masks Personally Identifiable Information (PII) in text blocks.
+
+    Uses Regex patterns to identify and redact sensitive data like emails
+    and phone numbers in 'text' and 'md' blocks.
+    """
+
+    component_name = "PiiMasker"
+
+    def initialize(self, mask_email: bool = True, mask_phone: bool = True, **kwargs):
+        """
+        Configure masking targets.
+
+        Args:
+            mask_email (bool): Whether to mask email addresses (default: True).
+            mask_phone (bool): Whether to mask phone numbers (default: True).
+            **kwargs: Additional arguments.
+        """
+        self.mask_email = mask_email
+        self.mask_phone = mask_phone
+        self._email_re = re.compile(r"[\w\.-]+@[\w\.-]+")
+        # Simple phone regex (customizable)
+        self._phone_re = re.compile(r"\d{3}[-\.\s]??\d{3,4}[-\.\s]??\d{4}")
+
+    def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
+        """
+        Apply masking regex to text content.
+
+        Args:
+            blocks (List[ContentBlock]): Input blocks.
+
+        Returns:
+            List[ContentBlock]: Blocks with sensitive info replaced by tokens.
+        """
+        for block in blocks:
+            if block.type not in ["text", "md"] or not isinstance(block.content, str):
+                continue
+
+            if self.mask_email:
+                block.content = self._email_re.sub("[EMAIL]", block.content)
+            if self.mask_phone:
+                block.content = self._phone_re.sub("[PHONE]", block.content)
+
+        return blocks
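Masking in isolation, as a sketch (the phone pattern above is deliberately simple; production use would likely swap in locale-specific patterns):

```python
from sayou.refinery.core.schemas import ContentBlock
from sayou.refinery.processor.pii_masker import PiiMasker

masker = PiiMasker()
masker.initialize()  # mask_email=True, mask_phone=True

blocks = [ContentBlock(type="text", content="Call 010-1234-5678 or mail ada@example.com")]
out = masker.process(blocks)
print(out[0].content)
# Call [PHONE] or mail [EMAIL]
```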
--- /dev/null
+++ sayou/refinery/processor/text_cleaner.py
@@ -0,0 +1,61 @@
+import re
+from typing import List
+
+from ..core.schemas import ContentBlock
+from ..interfaces.base_processor import BaseProcessor
+
+
+class TextCleaner(BaseProcessor):
+    """
+    (Tier 2) Cleans text content using regex and whitespace normalization.
+
+    Operates on 'text' and 'md' blocks to remove noise characters or custom patterns.
+    """
+
+    component_name = "TextCleaner"
+
+    def initialize(
+        self, patterns: List[str] = None, normalize_space: bool = True, **kwargs
+    ):
+        """
+        Configure cleaning patterns.
+
+        Args:
+            patterns (List[str]): List of regex patterns to remove from text.
+            normalize_space (bool): If True, collapses runs of spaces/tabs into one space.
+            **kwargs: Additional arguments.
+        """
+        self.normalize_space = normalize_space
+        self.patterns = [re.compile(p) for p in (patterns or [])]
+        self._space_re = re.compile(r"[ \t]+")
+
+    def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
+        """
+        Apply cleaning logic to text blocks.
+
+        Args:
+            blocks (List[ContentBlock]): Input blocks.
+
+        Returns:
+            List[ContentBlock]: Cleaned blocks.
+        """
+        for block in blocks:
+            if block.type not in ["text", "md"]:
+                continue
+
+            if not isinstance(block.content, str):
+                continue
+
+            text = block.content
+
+            # 1. Custom Patterns Removal
+            for pat in self.patterns:
+                text = pat.sub("", text)
+
+            # 2. Whitespace Normalization
+            if self.normalize_space:
+                text = self._space_re.sub(" ", text)
+
+            block.content = text.strip()
+
+        return blocks
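A sketch showing pattern removal plus whitespace collapsing; as noted in the docstring, the `[ \t]+` pattern collapses spaces and tabs but leaves newlines alone:

```python
from sayou.refinery.core.schemas import ContentBlock
from sayou.refinery.processor.text_cleaner import TextCleaner

cleaner = TextCleaner()
cleaner.initialize(patterns=[r"\[AD\]"])  # strip a hypothetical ad marker

blocks = [ContentBlock(type="text", content="[AD]  Hello\t\tworld  ")]
out = cleaner.process(blocks)
print(repr(out[0].content))
# 'Hello world'
```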
--- /dev/null
+++ sayou_refinery-0.1.6.dist-info/METADATA
@@ -0,0 +1,310 @@
+Metadata-Version: 2.4
+Name: sayou-refinery
+Version: 0.1.6
+Summary: Refinery components for the Sayou Data Platform
+Project-URL: Homepage, https://www.sayouzone.com/
+Project-URL: Documentation, https://sayouzone.github.io/sayou-fabric/
+Project-URL: Repository, https://github.com/sayouzone/sayou-fabric
+Author-email: Sayouzone <contact@sayouzone.com>
+License: Apache License
+        Version 2.0, January 2004
+        http://www.apache.org/licenses/
+
+        TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+        1. Definitions.
+
+        "License" shall mean the terms and conditions for use, reproduction,
+        and distribution as defined by Sections 1 through 9 of this document.
+
+        "Licensor" shall mean the copyright owner or entity authorized by
+        the copyright owner that is granting the License.
+
+        "Legal Entity" shall mean the union of the acting entity and all
+        other entities that control, are controlled by, or are under common
+        control with that entity. For the purposes of this definition,
+        "control" means (i) the power, direct or indirect, to cause the
+        direction or management of such entity, whether by contract or
+        otherwise, or (ii) ownership of fifty percent (50%) or more of the
+        outstanding shares, or (iii) beneficial ownership of such entity.
+
+        "You" (or "Your") shall mean an individual or Legal Entity
+        exercising permissions granted by this License.
+
+        "Source" form shall mean the preferred form for making modifications,
+        including but not limited to software source code, documentation
+        source, and configuration files.
+
+        "Object" form shall mean any form resulting from mechanical
+        transformation or translation of a Source form, including but
+        not limited to compiled object code, generated documentation,
+        and conversions to other media types.
+
+        "Work" shall mean the work of authorship, whether in Source or
+        Object form, made available under the License, as indicated by a
+        copyright notice that is included in or attached to the work
+        (an example is provided in the Appendix below).
+
+        "Derivative Works" shall mean any work, whether in Source or Object
+        form, that is based on (or derived from) the Work and for which the
+        editorial revisions, annotations, elaborations, or other modifications
+        represent, as a whole, an original work of authorship. For the purposes
+        of this License, Derivative Works shall not include works that remain
+        separable from, or merely link (or bind by name) to the interfaces of,
+        the Work and Derivative Works thereof.
+
+        "Contribution" shall mean any work of authorship, including
+        the original version of the Work and any modifications or additions
+        to that Work or Derivative Works thereof, that is intentionally
+        submitted to Licensor for inclusion in the Work by the copyright owner
+        or by an individual or Legal Entity authorized to submit on behalf of
+        the copyright owner. For the purposes of this definition, "submitted"
+        means any form of electronic, verbal, or written communication sent
+        to the Licensor or its representatives, including but not limited to
+        communication on electronic mailing lists, source code control systems,
+        and issue tracking systems that are managed by, or on behalf of, the
+        Licensor for the purpose of discussing and improving the Work, but
+        excluding communication that is conspicuously marked or otherwise
+        designated in writing by the copyright owner as "Not a Contribution."
+
+        "Contributor" shall mean Licensor and any individual or Legal Entity
+        on behalf of whom a Contribution has been received by Licensor and
+        subsequently incorporated within the Work.
+
+        2. Grant of Copyright License. Subject to the terms and conditions of
+        this License, each Contributor hereby grants to You a perpetual,
+        worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+        copyright license to reproduce, prepare Derivative Works of,
+        publicly display, publicly perform, sublicense, and distribute the
+        Work and such Derivative Works in Source or Object form.
+
+        3. Grant of Patent License. Subject to the terms and conditions of
+        this License, each Contributor hereby grants to You a perpetual,
+        worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+        (except as stated in this section) patent license to make, have made,
+        use, offer to sell, sell, import, and otherwise transfer the Work,
+        where such license applies only to those patent claims licensable
+        by such Contributor that are necessarily infringed by their
+        Contribution(s) alone or by combination of their Contribution(s)
+        with the Work to which such Contribution(s) was submitted. If You
+        institute patent litigation against any entity (including a
+        cross-claim or counterclaim in a lawsuit) alleging that the Work
+        or a Contribution incorporated within the Work constitutes direct
+        or contributory patent infringement, then any patent licenses
+        granted to You under this License for that Work shall terminate
+        as of the date such litigation is filed.
+
+        4. Redistribution. You may reproduce and distribute copies of the
+        Work or Derivative Works thereof in any medium, with or without
+        modifications, and in Source or Object form, provided that You
+        meet the following conditions:
+
+        (a) You must give any other recipients of the Work or
+            Derivative Works a copy of this License; and
+
+        (b) You must cause any modified files to carry prominent notices
+            stating that You changed the files; and
+
+        (c) You must retain, in the Source form of any Derivative Works
+            that You distribute, all copyright, patent, trademark, and
+            attribution notices from the Source form of the Work,
+            excluding those notices that do not pertain to any part of
+            the Derivative Works; and
+
+        (d) If the Work includes a "NOTICE" text file as part of its
+            distribution, then any Derivative Works that You distribute must
+            include a readable copy of the attribution notices contained
+            within such NOTICE file, excluding those notices that do not
+            pertain to any part of the Derivative Works, in at least one
+            of the following places: within a NOTICE text file distributed
+            as part of the Derivative Works; within the Source form or
+            documentation, if provided along with the Derivative Works; or,
+            within a display generated by the Derivative Works, if and
+            wherever such third-party notices normally appear. The contents
+            of the NOTICE file are for informational purposes only and
+            do not modify the License. You may add Your own attribution
+            notices within Derivative Works that You distribute, alongside
+            or as an addendum to the NOTICE text from the Work, provided
+            that such additional attribution notices cannot be construed
+            as modifying the License.
+
+        You may add Your own copyright statement to Your modifications and
+        may provide additional or different license terms and conditions
+        for use, reproduction, or distribution of Your modifications, or
+        for any such Derivative Works as a whole, provided Your use,
+        reproduction, and distribution of the Work otherwise complies with
+        the conditions stated in this License.
+
+        5. Submission of Contributions. Unless You explicitly state otherwise,
+        any Contribution intentionally submitted for inclusion in the Work
+        by You to the Licensor shall be under the terms and conditions of
+        this License, without any additional terms or conditions.
+        Notwithstanding the above, nothing herein shall supersede or modify
+        the terms of any separate license agreement you may have executed
+        with Licensor regarding such Contributions.
+
+        6. Trademarks. This License does not grant permission to use the trade
+        names, trademarks, service marks, or product names of the Licensor,
+        except as required for reasonable and customary use in describing the
+        origin of the Work and reproducing the content of the NOTICE file.
+
+        7. Disclaimer of Warranty. Unless required by applicable law or
+        agreed to in writing, Licensor provides the Work (and each
+        Contributor provides its Contributions) on an "AS IS" BASIS,
+        WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+        implied, including, without limitation, any warranties or conditions
+        of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+        PARTICULAR PURPOSE. You are solely responsible for determining the
+        appropriateness of using or redistributing the Work and assume any
+        risks associated with Your exercise of permissions under this License.
+
+        8. Limitation of Liability. In no event and under no legal theory,
+        whether in tort (including negligence), contract, or otherwise,
+        unless required by applicable law (such as deliberate and grossly
+        negligent acts) or agreed to in writing, shall any Contributor be
+        liable to You for damages, including any direct, indirect, special,
+        incidental, or consequential damages of any character arising as a
+        result of this License or out of the use or inability to use the
+        Work (including but not limited to damages for loss of goodwill,
+        work stoppage, computer failure or malfunction, or any and all
+        other commercial damages or losses), even if such Contributor
+        has been advised of the possibility of such damages.
+
+        9. Accepting Warranty or Additional Liability. While redistributing
+        the Work or Derivative Works thereof, You may choose to offer,
+        and charge a fee for, acceptance of support, warranty, indemnity,
+        or other liability obligations and/or rights consistent with this
+        License. However, in accepting such obligations, You may act only
+        on Your own behalf and on Your sole responsibility, not on behalf
+        of any other Contributor, and only if You agree to indemnify,
+        defend, and hold each Contributor harmless for any liability
+        incurred by, or claims asserted against, such Contributor by reason
+        of your accepting any such warranty or additional liability.
+
+        END OF TERMS AND CONDITIONS
+
+        APPENDIX: How to apply the Apache License to your work.
+
+        To apply the Apache License to your work, attach the following
+        boilerplate notice, with the fields enclosed by brackets "[]"
+        replaced with your own identifying information. (Don't include
+        the brackets!) The text should be enclosed in the appropriate
+        comment syntax for the file format. We also recommend that a
+        file or class name and description of purpose be included on the
+        same "printed page" as the copyright notice for easier
+        identification within third-party archives.
+
+        Copyright [yyyy] [name of copyright owner]
+
+        Licensed under the Apache License, Version 2.0 (the "License");
+        you may not use this file except in compliance with the License.
+        You may obtain a copy of the License at
+
+            http://www.apache.org/licenses/LICENSE-2.0
+
+        Unless required by applicable law or agreed to in writing, software
+        distributed under the License is distributed on an "AS IS" BASIS,
+        WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+        See the License for the specific language governing permissions and
+        limitations under the License.
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
+Requires-Python: >=3.9
+Requires-Dist: sayou-core~=0.1.2
+Description-Content-Type: text/markdown
+
+# sayou-refinery
+
+[](https://pypi.org/project/sayou-refinery/)
+[](https://www.apache.org/licenses/LICENSE-2.0)
+[](https://sayouzone.github.io/sayou-fabric/library-guides/refinery/)
+
+**The Universal Data Cleaning & Normalization Engine for Sayou Fabric.**
+
+`sayou-refinery` acts as the "Cleaning Plant" in your data pipeline.
+
+It transforms heterogeneous raw data (JSON Documents, HTML, DB Records) into a standardized stream of **ContentBlocks**, ensuring that downstream components (like Chunkers or LLMs) receive clean, uniform data regardless of the original source format.
+
+## 💡 Core Philosophy
+
+**"Flatten Structure, Polish Content."**
+
+Refinery operates in two distinct stages to guarantee data quality:
+
+1. **Normalization (Shape Shifting):** Converts complex structures (nested JSON, HTML trees, DB Rows) into a linear list of `ContentBlocks`.
+2. **Processing (Cleaning):** Applies a chain of cleaning agents (Regex, Masking, Deduplication) to improve data hygiene.
+
+## 📦 Installation
+
+```bash
+pip install sayou-refinery
+```
+
+## ⚡ Quick Start
+
+The `RefineryPipeline` orchestrates the normalization and processing chain.
+
+```python
+from sayou.refinery.pipeline import RefineryPipeline
+
+def run_demo():
+    # 1. Initialize with specific cleaning rules
+    pipeline = RefineryPipeline()
+    pipeline.initialize(
+        mask_email=True,
+        outlier_rules={"price": {"min": 0, "max": 1000, "action": "clamp"}}
+    )
+
+    # 2. Raw Data (e.g., from sayou-document)
+    raw_doc = {
+        "metadata": {"title": "Test Doc"},
+        "pages": [{
+            "elements": [
+                {"type": "text", "text": "Contact: admin@sayou.ai"},
+                {"type": "text", "text": " Dirty Whitespace "}
+            ]
+        }]
+    }
+
+    # 3. Run Pipeline
+    # source_type: 'standard_doc', 'html', 'json', etc.
+    blocks = pipeline.run(raw_doc, source_type="standard_doc")
+
+    # 4. Result
+    for block in blocks:
+        print(f"[{block.type}] {block.content}")
+
+    # Output:
+    # [md] --- title: Test Doc ...
+    # [md] Contact: [EMAIL]
+    # [md] Dirty Whitespace
+
+if __name__ == "__main__":
+    run_demo()
+```
+
+## 🔑 Key Components
+
+### Normalizers
+* **`DocMarkdownNormalizer`**: Converts Sayou Document Dicts into Markdown blocks.
+* **`HtmlTextNormalizer`**: Strips HTML tags and scripts, extracting clean text.
+* **`RecordNormalizer`**: Converts DB rows or JSON objects into 'record' blocks.
+
+### Processors
+* **`TextCleaner`**: Normalizes whitespace and removes noise via regex.
+* **`PiiMasker`**: Masks sensitive info like emails and phone numbers.
+* **`Deduplicator`**: Removes duplicate content blocks.
+* **`Imputer`**: Fills missing values in record blocks.
+* **`OutlierHandler`**: Filters or clamps numerical outliers in records.
+
+## 🤝 Contributing
+
+We welcome contributions for new Normalizers (e.g., `CsvNormalizer`, `LogNormalizer`) or Processors (e.g., `LangChainFilter`).
+
+## 📜 License
+
+Apache 2.0 License © 2025 Sayouzone
--- /dev/null
+++ sayou_refinery-0.1.6.dist-info/RECORD
@@ -0,0 +1,16 @@
+sayou/refinery/pipeline.py,sha256=oJbygy300ounS3xL3UdCpwnmdmUTRib-W-ADsPJ1Vjs,3756
+sayou/refinery/core/exceptions.py,sha256=LhY8tDk9UzqjGjy-7UPpzBSRpH4vUl3ZemmW-BssdJY,547
+sayou/refinery/core/schemas.py,sha256=LhKV5X8WIiUV273OpX9y7TduQUX03qZh-rgRSMn_eCs,727
+sayou/refinery/interfaces/base_normalizer.py,sha256=nYQ40IM83WXnIiSIDqhWHfiHAC0O4F9iymb7-u7cSIE,1862
+sayou/refinery/interfaces/base_processor.py,sha256=A9YvD1ZwHhlK290Y77xkSeJTtBCJ53wAYI-KeuVgShM,1650
+sayou/refinery/normalizer/doc_markdown_normalizer.py,sha256=ZVkTEjCesrbjWRRetD1Lp6_YjHsdDL9GNXO4yvUYIUw,10277
+sayou/refinery/normalizer/html_text_normalizer.py,sha256=hX0UTbJwND0Rv-_HuGL-p4Popdrg6_m_mDkKZDYW5AE,1675
+sayou/refinery/normalizer/record_normalizer.py,sha256=bzErNEVw9g-QtQiq_wdm_AxiZi_uuvtx23AL8DRjgxQ,2029
+sayou/refinery/processor/deduplicator.py,sha256=yKZkaPyY4P_a-IwIK8f3XCYqgnoF0us0NbMsBDY03ic,1402
+sayou/refinery/processor/imputer.py,sha256=vvaGvxQNajKSjbYr-gNHmd4HYsD4FtchJhGfnKv-cpo,1562
+sayou/refinery/processor/outlier_handler.py,sha256=bWcECxwVVvfcjiy5lm3YYABUA_7lIW_Do5Q70rK-mtM,2693
+sayou/refinery/processor/pii_masker.py,sha256=fbXPE2HLESQFUvNITnLp-9q2L4MEPcfIkLUUGRIva8k,1726
+sayou/refinery/processor/text_cleaner.py,sha256=8_Hu6H_W__tNfwebm9cS43DRc8EExkhfpkmxslShDjU,1748
+sayou_refinery-0.1.6.dist-info/METADATA,sha256=zmor5IfNcoOzqqmX2OVwudo42b-43etk20LFf5r5wkg,16989
+sayou_refinery-0.1.6.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+sayou_refinery-0.1.6.dist-info/RECORD,,