sayou_refinery-0.1.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
+ from sayou.core.exceptions import SayouCoreError
+
+
+ class RefineryError(SayouCoreError):
+     """
+     Base exception for all errors within the sayou-refinery toolkit.
+     """
+
+     pass
+
+
+ class NormalizationError(RefineryError):
+     """
+     Raised when raw data cannot be converted to ContentBlocks.
+     (e.g., Malformed JSON, Unsupported format)
+     """
+
+     pass
+
+
+ class ProcessingError(RefineryError):
+     """
+     Raised when a processor fails to clean or transform blocks.
+     (e.g., PII masking failure, Imputation rule error)
+     """
+
+     pass
@@ -0,0 +1,27 @@
+ from typing import Any, Dict, List, Union
+
+ from pydantic import BaseModel, Field
+
+
+ class ContentBlock(BaseModel):
+     """
+     Standard unit of content refined from raw data.
+
+     Refinery normalizes raw inputs into a list of these blocks.
+     Processors iterate over these blocks to clean or modify them.
+     """
+
+     type: str = Field(
+         ..., description="Block type (e.g., 'text', 'md', 'record', 'table')"
+     )
+
+     content: Union[str, Dict[str, Any], List[Any]] = Field(
+         ..., description="The actual data payload"
+     )
+
+     metadata: Dict[str, Any] = Field(
+         default_factory=dict, description="Context info (page_num, source_id, etc.)"
+     )
+
+     class Config:
+         arbitrary_types_allowed = True
@@ -0,0 +1,62 @@
+ from abc import abstractmethod
+ from typing import Any, List
+
+ from sayou.core.base_component import BaseComponent
+ from sayou.core.decorators import measure_time
+
+ from ..core.exceptions import NormalizationError
+ from ..core.schemas import ContentBlock
+
+
+ class BaseNormalizer(BaseComponent):
+     """
+     (Tier 1) Abstract base class for converting raw input into ContentBlocks.
+
+     Normalizers are responsible for structural transformation:
+     Raw Data (JSON, HTML, DB Row) -> List[ContentBlock]
+     """
+
+     component_name = "BaseNormalizer"
+     SUPPORTED_TYPES = []
+
+     @measure_time
+     def normalize(self, raw_data: Any) -> List[ContentBlock]:
+         """
+         Execute the normalization process.
+
+         Args:
+             raw_data: The raw input data from Connector or Document.
+
+         Returns:
+             List[ContentBlock]: A list of normalized content blocks.
+
+         Raises:
+             NormalizationError: If transformation fails.
+         """
+         self._log(f"Normalizing data (Type: {type(raw_data).__name__})")
+         try:
+             blocks = self._do_normalize(raw_data)
+             if not isinstance(blocks, list):
+                 raise NormalizationError(f"Output must be a list, got {type(blocks)}")
+
+             return blocks
+
+         except Exception as e:
+             wrapped_error = NormalizationError(
+                 f"[{self.component_name}] Failed: {str(e)}"
+             )
+             self.logger.error(wrapped_error, exc_info=True)
+             raise wrapped_error
+
+     @abstractmethod
+     def _do_normalize(self, raw_data: Any) -> List[ContentBlock]:
+         """
+         [Abstract Hook] Implement logic to convert specific raw format to ContentBlocks.
+
+         Args:
+             raw_data: The raw input.
+
+         Returns:
+             List[ContentBlock]: The standardized blocks.
+         """
+         raise NotImplementedError
@@ -0,0 +1,57 @@
+ from abc import abstractmethod
+ from typing import List
+
+ from sayou.core.base_component import BaseComponent
+ from sayou.core.decorators import measure_time
+
+ from ..core.exceptions import ProcessingError
+ from ..core.schemas import ContentBlock
+
+
+ class BaseProcessor(BaseComponent):
+     """
+     (Tier 1) Abstract base class for processing/cleaning ContentBlocks.
+
+     Processors operate on data that is already normalized. They can modify content
+     (e.g., PII masking, Imputation) or filter out blocks (e.g., Deduplication).
+     """
+
+     component_name = "BaseProcessor"
+
+     @measure_time
+     def process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
+         """
+         Execute the processing logic on a list of blocks.
+
+         Args:
+             blocks: Input list of ContentBlocks.
+
+         Returns:
+             List[ContentBlock]: Processed list of ContentBlocks.
+
+         Raises:
+             ProcessingError: If processing logic fails.
+         """
+         try:
+             if not blocks:
+                 return []
+
+             return self._do_process(blocks)
+
+         except Exception as e:
+             wrapped_error = ProcessingError(f"[{self.component_name}] Failed: {str(e)}")
+             self.logger.error(wrapped_error, exc_info=True)
+             raise wrapped_error
+
+     @abstractmethod
+     def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
+         """
+         [Abstract Hook] Implement cleaning/filtering logic.
+
+         Args:
+             blocks: List of input ContentBlocks.
+
+         Returns:
+             List[ContentBlock]: Modified list of ContentBlocks.
+         """
+         raise NotImplementedError
@@ -0,0 +1,306 @@
+ from typing import Any, Dict, List
+
+ from ..core.exceptions import NormalizationError
+ from ..core.schemas import ContentBlock
+ from ..interfaces.base_normalizer import BaseNormalizer
+
+
+ class DocMarkdownNormalizer(BaseNormalizer):
+     """
+     (Tier 2) Normalizes a Sayou Document Dictionary into Markdown ContentBlocks.
+
+     This engine parses the structured dictionary output from 'sayou-document' and
+     converts individual elements (Text, Table, Image, Chart) into semantically
+     rich Markdown blocks. It also handles metadata conversion to Frontmatter.
+     """
+
+     component_name = "DocMarkdownNormalizer"
+     SUPPORTED_TYPES = ["standard_doc", "sayou_doc_json"]
+
+     def initialize(
+         self,
+         include_headers: bool = True,
+         include_footers: bool = False,
+         **kwargs,
+     ):
+         """
+         Configure the normalizer's behavior regarding document structure.
+
+         Args:
+             include_headers (bool): If True, processes elements found in page headers.
+             include_footers (bool): If True, processes elements found in page footers.
+             **kwargs: Additional configuration parameters passed to parent.
+         """
+         super().initialize(**kwargs)
+         self.include_headers = include_headers
+         self.include_footers = include_footers
+
+     def _do_normalize(self, raw_data: Any) -> List[ContentBlock]:
+         """
+         Execute the normalization logic on the document dictionary.
+
+         Args:
+             raw_data (Any): The input dictionary adhering to Sayou Document Schema.
+
+         Returns:
+             List[ContentBlock]: A list of normalized content blocks (mostly 'md' type).
+
+         Raises:
+             NormalizationError: If `raw_data` is not a valid dictionary.
+         """
+         if not isinstance(raw_data, dict):
+             raise NormalizationError(
+                 f"Input must be a Dictionary, got {type(raw_data).__name__}"
+             )
+
+         doc_data = raw_data
+         blocks: List[ContentBlock] = []
+
+         if "metadata" in doc_data and doc_data["metadata"]:
+             blocks.extend(self._handle_doc_metadata(doc_data))
+
+         for page in doc_data.get("pages", []):
+             if self.include_headers and "header_elements" in page:
+                 for element in page.get("header_elements", []):
+                     blocks.extend(
+                         self._handle_element(element, is_header=True, is_footer=False)
+                     )
+
+             for element in page.get("elements", []):
+                 blocks.extend(
+                     self._handle_element(element, is_header=False, is_footer=False)
+                 )
+
+             if self.include_footers and "footer_elements" in page:
+                 for element in page.get("footer_elements", []):
+                     # T2 default rule: even if include_footers is True, _handle_element
+                     # may skip an element via the is_footer=True flag (T3 can override)
+                     blocks.extend(
+                         self._handle_element(element, is_header=False, is_footer=True)
+                     )
+
+         return blocks
+
+     def _handle_element(
+         self, element: Dict[str, Any], is_header: bool, is_footer: bool
+     ) -> List[ContentBlock]:
+         """
+         Dispatch the element to specific handlers based on its 'type' field.
+
+         Args:
+             element (Dict[str, Any]): The element dictionary.
+             is_header (bool): True if the element is part of the page header.
+             is_footer (bool): True if the element is part of the page footer.
+
+         Returns:
+             List[ContentBlock]: The resulting block(s) from the element.
+         """
+         if is_footer and not self.include_footers:
+             return []
+
+         elem_type = element.get("type")
+
+         if elem_type == "text":
+             return self._handle_text(element, is_header, is_footer)
+
+         if elem_type == "table":
+             return self._handle_table(element, is_header, is_footer)
+
+         if elem_type == "image":
+             return self._handle_image(element, is_header, is_footer)
+
+         if elem_type == "chart":
+             return self._handle_chart(element, is_header, is_footer)
+
+         return []
+
+     def _handle_doc_metadata(self, doc_data: Dict[str, Any]) -> List[ContentBlock]:
+         """
+         Convert document-level metadata into a Markdown Frontmatter block.
+
+         Args:
+             doc_data (Dict[str, Any]): The root document dictionary containing 'metadata'.
+
+         Returns:
+             List[ContentBlock]: A single block containing YAML-like frontmatter.
+         """
+         md_frontmatter = "---\n"
+         metadata = doc_data.get("metadata", {})
+
+         title = metadata.get("title")
+         author = metadata.get("author")
+
+         if title:
+             md_frontmatter += f"title: {title}\n"
+         if author:
+             md_frontmatter += f"author: {author}\n"
+         md_frontmatter += "---\n\n"
+
+         return [
+             ContentBlock(
+                 type="md",
+                 content=md_frontmatter,
+                 metadata={"page_num": 0, "id": "metadata", "is_footer": False},
+             )
+         ]
+
+     def _handle_text(
+         self, element: Dict[str, Any], is_header: bool, is_footer: bool
+     ) -> List[ContentBlock]:
+         """
+         Convert a text element to a Markdown block, handling headings and lists.
+
+         Uses 'semantic_type' (heading/list) and 'level' attributes to generate
+         appropriate Markdown syntax (e.g., '# Title', '- Item').
+         """
+         text = element.get("text", "").strip()
+         if not text:
+             return []
+
+         raw_attrs = element.get("raw_attributes", {})
+         semantic_type = raw_attrs.get("semantic_type")
+
+         content = None
+
+         # 1. List items ('●' bullets)
+         if semantic_type == "list":
+             level = raw_attrs.get("list_level", 0)
+             indent = " " * level
+             content = f"{indent}- {text}"
+
+         # 2. Headings (levels 1-9)
+         elif semantic_type == "heading":
+             level = raw_attrs.get("heading_level", 1)
+             hashes = "#" * level
+             content = f"{hashes} {text}"
+
+         # 3. PPT placeholders (legacy compatibility)
+         elif raw_attrs.get("placeholder_type") == "TITLE":
+             content = f"# {text}"
+
+         # 4. Everything else (plain text)
+         else:
+             content = text
+
+         return [
+             ContentBlock(
+                 type="md",
+                 content=content,
+                 metadata={
+                     "page_num": element.get("meta", {}).get("page_num"),
+                     "id": element.get("id"),
+                     "style": raw_attrs.get("style"),
+                     "is_footer": is_footer,
+                 },
+             )
+         ]
+
+     def _handle_table(
+         self, element: Dict[str, Any], is_header: bool, is_footer: bool
+     ) -> List[ContentBlock]:
+         """
+         Convert a table element into a Markdown table representation.
+
+         Args:
+             element (Dict[str, Any]): Must contain 'data' (2D list).
+         """
+         md_table = ""
+         table_data = element.get("data", [])
+
+         if not table_data:
+             return []
+
+         max_cols = 0
+         for row in table_data:
+             if row:
+                 max_cols = max(max_cols, len(row))
+
+         if max_cols == 0:
+             return []
+
+         # 1. Header row (the first row)
+         header = table_data[0]
+         header_cells = list(map(str, header)) + [""] * (max_cols - len(header))
+         md_table += "| " + " | ".join(header_cells) + " |\n"
+
+         # 2. Separator row (based on the maximum column count)
+         md_table += "| " + " | ".join(["---"] * max_cols) + " |\n"
+
+         # 3. Body rows (from the second row onward)
+         for row in table_data[1:]:
+             body_cells = list(map(str, row)) + [""] * (max_cols - len(row))
+             md_table += "| " + " | ".join(body_cells) + " |\n"
+
+         return [
+             ContentBlock(
+                 type="md",
+                 content=md_table.strip(),
+                 metadata={
+                     "page_num": element.get("meta", {}).get("page_num"),
+                     "id": element.get("id"),
+                     "is_footer": is_footer,
+                 },
+             )
+         ]
+
+     def _handle_image(
+         self, element: Dict[str, Any], is_header: bool, is_footer: bool
+     ) -> List[ContentBlock]:
+         """
+         Process an image element.
+
+         Depending on implementation, this might return an 'image_base64' block
+         or a Markdown image link if an external URL is provided.
+         """
+         image_base64 = element.get("image_base64")
+         if not image_base64:
+             return []
+
+         ocr_text = (element.get("ocr_text") or "").strip()
+         if not ocr_text:
+             alt_text = "image"
+         else:
+             alt_text = ocr_text
+
+         img_format = element.get("image_format", "png")
+
+         return [
+             ContentBlock(
+                 type="image_base64",
+                 content=image_base64,
+                 metadata={
+                     "page_num": element.get("meta", {}).get("page_num"),
+                     "id": element.get("id"),
+                     "is_footer": is_footer,
+                     "alt_text": alt_text,
+                     "format": img_format,
+                 },
+             )
+         ]
+
+     def _handle_chart(
+         self, element: Dict[str, Any], is_header: bool, is_footer: bool
+     ) -> List[ContentBlock]:
+         """
+         Convert a chart element into its text representation.
+
+         Uses the 'text_representation' field from the element to create
+         a descriptive text block for LLM consumption.
+         """
+         text_rep = element.get("text_representation")
+         if not text_rep:
+             return []
+
+         content = f"--- Chart Data ---\n{text_rep}\n--------------------\n"
+
+         return [
+             ContentBlock(
+                 type="md",
+                 content=content,
+                 metadata={
+                     "page_num": element.get("meta", {}).get("page_num"),
+                     "id": element.get("id"),
+                     "is_footer": is_footer,
+                 },
+             )
+         ]
@@ -0,0 +1,59 @@
+ import re
+
+ try:
+     from bs4 import BeautifulSoup
+ except ImportError:
+     BeautifulSoup = None
+
+ from typing import Any, List
+
+ from ..core.exceptions import NormalizationError
+ from ..core.schemas import ContentBlock
+ from ..interfaces.base_normalizer import BaseNormalizer
+
+
+ class HtmlTextNormalizer(BaseNormalizer):
+     """
+     (Tier 2) Converts an HTML string into a clean Text ContentBlock.
+
+     Uses BeautifulSoup to strip tags, scripts, and styles, returning only
+     the visible text content while preserving paragraph structure.
+     """
+
+     component_name = "HtmlTextNormalizer"
+     SUPPORTED_TYPES = ["html"]
+
+     def _do_normalize(self, raw_data: Any) -> List[ContentBlock]:
+         """
+         Parse HTML and extract text.
+
+         Args:
+             raw_data (Any): The input HTML string.
+
+         Returns:
+             List[ContentBlock]: A single block of type 'text'.
+
+         Raises:
+             ImportError: If BeautifulSoup4 is not installed.
+             NormalizationError: If input is not a string.
+         """
+         if not BeautifulSoup:
+             raise ImportError("BeautifulSoup4 is required for HtmlTextNormalizer.")
+
+         if not isinstance(raw_data, str):
+             raise NormalizationError(
+                 f"Input must be HTML string, got {type(raw_data)}."
+             )
+
+         soup = BeautifulSoup(raw_data, "html.parser")
+
+         for tag in soup(["script", "style", "noscript", "iframe"]):
+             tag.extract()
+
+         text = soup.get_text(separator="\n")
+
+         text = re.sub(r"\n{3,}", "\n\n", text).strip()
+
+         return [
+             ContentBlock(type="text", content=text, metadata={"source_type": "html"})
+         ]
@@ -0,0 +1,66 @@
+ from typing import Any, Dict, List
+
+ from ..core.exceptions import NormalizationError
+ from ..core.schemas import ContentBlock
+ from ..interfaces.base_normalizer import BaseNormalizer
+
+
+ class RecordNormalizer(BaseNormalizer):
+     """
+     (Tier 2) Converts structured data (Dict/List) into 'record' ContentBlocks.
+
+     Suitable for processing database rows, CSV records, or JSON API responses.
+     Each dictionary becomes a separate ContentBlock of type 'record'.
+     """
+
+     component_name = "RecordNormalizer"
+     SUPPORTED_TYPES = ["json", "dict", "db_row", "record"]
+
+     def _do_normalize(self, raw_data: Any) -> List[ContentBlock]:
+         """
+         Convert dict or list of dicts into record blocks.
+
+         Args:
+             raw_data (Any): A Dictionary or a List of Dictionaries.
+
+         Returns:
+             List[ContentBlock]: Blocks of type 'record'.
+         """
+         blocks = []
+
+         # Case 1: Single Dictionary
+         if isinstance(raw_data, dict):
+             blocks.append(self._create_block(raw_data))
+
+         # Case 2: List of Dictionaries
+         elif isinstance(raw_data, list):
+             for item in raw_data:
+                 if isinstance(item, dict):
+                     blocks.append(self._create_block(item))
+                 else:
+                     self._log(
+                         f"Skipping non-dict item in list: {type(item)}", level="warning"
+                     )
+
+         else:
+             raise NormalizationError(
+                 f"Input must be Dict or List[Dict], got {type(raw_data)}"
+             )
+
+         return blocks
+
+     def _create_block(self, data: Dict[str, Any]) -> ContentBlock:
+         """
+         Helper to wrap a single dictionary into a ContentBlock.
+
+         Args:
+             data (Dict[str, Any]): The data record.
+
+         Returns:
+             ContentBlock: A block with type='record' and content=data.
+         """
+         return ContentBlock(
+             type="record",
+             content=data,
+             metadata={"fields": list(data.keys())},
+         )
@@ -0,0 +1,111 @@
+ from typing import Any, Dict, List, Optional
+
+ from sayou.core.base_component import BaseComponent
+ from sayou.core.decorators import safe_run
+
+ from .core.exceptions import RefineryError
+ from .core.schemas import ContentBlock
+ from .interfaces.base_normalizer import BaseNormalizer
+ from .interfaces.base_processor import BaseProcessor
+ from .normalizer.doc_markdown_normalizer import DocMarkdownNormalizer
+ from .normalizer.html_text_normalizer import HtmlTextNormalizer
+ from .normalizer.record_normalizer import RecordNormalizer
+ from .processor.deduplicator import Deduplicator
+ from .processor.imputer import Imputer
+ from .processor.outlier_handler import OutlierHandler
+ from .processor.pii_masker import PiiMasker
+ from .processor.text_cleaner import TextCleaner
+
+
+ class RefineryPipeline(BaseComponent):
+     """
+     Orchestrates the data refinement process.
+     1. Selects a Normalizer to convert raw data into standard ContentBlocks.
+     2. Runs a chain of Processors to clean and transform the blocks.
+     """
+
+     component_name = "RefineryPipeline"
+
+     def __init__(
+         self,
+         extra_normalizers: Optional[List[BaseNormalizer]] = None,
+         processors: Optional[List[BaseProcessor]] = None,
+     ):
+         super().__init__()
+         self.normalizers: Dict[str, BaseNormalizer] = {}
+
+         # 1. Register Default Normalizers
+         defaults = [DocMarkdownNormalizer(), HtmlTextNormalizer(), RecordNormalizer()]
+         self._register(defaults)
+
+         # 2. Register User Extras
+         if extra_normalizers:
+             self._register(extra_normalizers)
+
+         # 3. Setup Processors Chain
+         self.processors = (
+             processors
+             if processors is not None
+             else [
+                 TextCleaner(),
+                 PiiMasker(),
+                 Deduplicator(),
+                 Imputer(),
+                 OutlierHandler(),
+             ]
+         )
+
+     def _register(self, comps: List[BaseNormalizer]):
+         for c in comps:
+             for t in getattr(c, "SUPPORTED_TYPES", []):
+                 self.normalizers[t] = c
+
+     @safe_run(default_return=None)
+     def initialize(self, **kwargs):
+         """
+         Initialize all sub-components (Normalizers and Processors).
+         Passes global configuration (like PII masking rules) down to components.
+         """
+         for norm in set(self.normalizers.values()):
+             norm.initialize(**kwargs)
+
+         for proc in self.processors:
+             proc.initialize(**kwargs)
+
+         self._log(
+             f"Refinery initialized with {len(self.processors)} processors in chain."
+         )
+
+     def run(
+         self, raw_data: Any, source_type: str = "standard_doc"
+     ) -> List[ContentBlock]:
+         """
+         Execute the refinement pipeline.
+
+         Args:
+             raw_data: The raw input data (dict, html string, db row list, etc.)
+             source_type: The type of input data (e.g., 'standard_doc', 'html', 'json')
+
+         Returns:
+             List[ContentBlock]: A list of clean, normalized blocks.
+         """
+         # Step 1: Normalize (Structure Transformation)
+         normalizer = self.normalizers.get(source_type)
+         if not normalizer:
+             supported = list(self.normalizers.keys())
+             raise RefineryError(
+                 f"Unknown source_type '{source_type}'. Supported: {supported}"
+             )
+
+         try:
+             blocks = normalizer.normalize(raw_data)
+         except Exception as e:
+             self.logger.error(f"Normalization step failed: {e}")
+             return []
+
+         # Step 2: Process (Content Cleaning)
+         # Processors modify blocks in-place or return new lists
+         for processor in self.processors:
+             blocks = processor.process(blocks)
+
+         return blocks
@@ -0,0 +1,48 @@
+ import json
+ from typing import List, Set
+
+ from ..core.schemas import ContentBlock
+ from ..interfaces.base_processor import BaseProcessor
+
+
+ class Deduplicator(BaseProcessor):
+     """
+     (Tier 2) Removes duplicate blocks based on content hashing.
+
+     It computes a hash of the content for each block and filters out
+     subsequent blocks that match an already seen hash.
+     """
+
+     component_name = "Deduplicator"
+
+     def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
+         """
+         Iterate through blocks and remove duplicates.
+
+         Args:
+             blocks (List[ContentBlock]): The input list of blocks.
+
+         Returns:
+             List[ContentBlock]: A new list with duplicates removed.
+         """
+         seen_hashes: Set[int] = set()
+         unique_blocks: List[ContentBlock] = []
+
+         for block in blocks:
+             # Generate stable hash key
+             if isinstance(block.content, dict):
+                 content_str = json.dumps(block.content, sort_keys=True)
+             else:
+                 content_str = str(block.content)
+
+             if len(content_str) < 5:
+                 unique_blocks.append(block)
+                 continue
+
+             content_hash = hash(content_str)
+
+             if content_hash not in seen_hashes:
+                 seen_hashes.add(content_hash)
+                 unique_blocks.append(block)
+
+         return unique_blocks
@@ -0,0 +1,51 @@
+ from typing import Any, Dict, List
+
+ from ..core.schemas import ContentBlock
+ from ..interfaces.base_processor import BaseProcessor
+
+
+ class Imputer(BaseProcessor):
+     """
+     (Tier 2) Fills missing values in 'record' type blocks using defined rules.
+
+     Only operates on blocks with type='record' where the content is a dictionary.
+     """
+
+     component_name = "Imputer"
+
+     def initialize(self, imputation_rules: Dict[str, Any] = None, **kwargs):
+         """
+         Set imputation rules.
+
+         Args:
+             imputation_rules (Dict[str, Any]): Mapping of field names to default values.
+                 Example: {"category": "Unknown", "price": 0.0}
+             **kwargs: Additional arguments.
+         """
+         self.rules = imputation_rules or {}
+         if not self.rules:
+             self._log("Imputer initialized with no rules.", level="warning")
+
+     def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
+         """
+         Apply imputation rules to record blocks.
+
+         Args:
+             blocks (List[ContentBlock]): Input blocks.
+
+         Returns:
+             List[ContentBlock]: Blocks with missing values filled.
+         """
+         for block in blocks:
+             if block.type != "record" or not isinstance(block.content, dict):
+                 continue
+
+             record = block.content
+
+             for field, default_value in self.rules.items():
+                 if record.get(field) is None:
+                     record[field] = default_value
+
+             block.content = record
+
+         return blocks
@@ -0,0 +1,84 @@
+ from typing import Any, Dict, List
+
+ from ..core.schemas import ContentBlock
+ from ..interfaces.base_processor import BaseProcessor
+
+
+ class OutlierHandler(BaseProcessor):
+     """
+     (Tier 2) Handles numerical outliers in 'record' blocks.
+
+     Can either 'drop' the entire block or 'clamp' the value to a boundary
+     if a field violates the defined min/max rules.
+     """
+
+     component_name = "OutlierHandler"
+
+     def initialize(self, outlier_rules: Dict[str, Dict[str, Any]] = None, **kwargs):
+         """
+         Set outlier handling rules.
+
+         Args:
+             outlier_rules (Dict[str, Dict[str, Any]]): Mapping of field names to constraints.
+                 Example:
+                     {
+                         "age": {"min": 0, "max": 120, "action": "drop"},
+                         "score": {"min": 0, "max": 100, "action": "clamp"}
+                     }
+             **kwargs: Additional arguments.
+         """
+         self.rules = outlier_rules or {}
+
+     def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
+         """
+         Check numerical fields against rules and filter/modify blocks.
+
+         Args:
+             blocks (List[ContentBlock]): Input blocks.
+
+         Returns:
+             List[ContentBlock]: Filtered or modified list of blocks.
+         """
+         valid_blocks = []
+
+         for block in blocks:
+             if block.type != "record" or not isinstance(block.content, dict):
+                 valid_blocks.append(block)
+                 continue
+
+             record = block.content
+             should_drop = False
+
+             for field, rule in self.rules.items():
+                 val = record.get(field)
+                 if val is None:
+                     continue
+
+                 try:
+                     val_f = float(val)
+                     min_v = rule.get("min")
+                     max_v = rule.get("max")
+                     action = rule.get("action", "drop")
+
+                     if min_v is not None and val_f < min_v:
+                         if action == "drop":
+                             should_drop = True
+                             break
+                         elif action == "clamp":
+                             record[field] = min_v
+
+                     if max_v is not None and val_f > max_v:
+                         if action == "drop":
+                             should_drop = True
+                             break
+                         elif action == "clamp":
+                             record[field] = max_v
+
+                 except (ValueError, TypeError):
+                     continue
+
+             if not should_drop:
+                 block.content = record
+                 valid_blocks.append(block)
+
+         return valid_blocks
@@ -0,0 +1,52 @@
+ import re
+ from typing import List
+
+ from ..core.schemas import ContentBlock
+ from ..interfaces.base_processor import BaseProcessor
+
+
+ class PiiMasker(BaseProcessor):
+     """
+     (Tier 2) Masks Personally Identifiable Information (PII) in text blocks.
+
+     Uses Regex patterns to identify and redact sensitive data like emails
+     and phone numbers in 'text' and 'md' blocks.
+     """
+
+     component_name = "PiiMasker"
+
+     def initialize(self, mask_email: bool = True, mask_phone: bool = True, **kwargs):
+         """
+         Configure masking targets.
+
+         Args:
+             mask_email (bool): Whether to mask email addresses (default: True).
+             mask_phone (bool): Whether to mask phone numbers (default: True).
+             **kwargs: Additional arguments.
+         """
+         self.mask_email = mask_email
+         self.mask_phone = mask_phone
+         self._email_re = re.compile(r"[\w\.-]+@[\w\.-]+")
+         # Simple phone regex (customizable)
+         self._phone_re = re.compile(r"\d{3}[-\.\s]??\d{3,4}[-\.\s]??\d{4}")
+
+     def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
+         """
+         Apply masking regex to text content.
+
+         Args:
+             blocks (List[ContentBlock]): Input blocks.
+
+         Returns:
+             List[ContentBlock]: Blocks with sensitive info replaced by tokens.
+         """
+         for block in blocks:
+             if block.type not in ["text", "md"] or not isinstance(block.content, str):
+                 continue
+
+             if self.mask_email:
+                 block.content = self._email_re.sub("[EMAIL]", block.content)
+             if self.mask_phone:
+                 block.content = self._phone_re.sub("[PHONE]", block.content)
+
+         return blocks
@@ -0,0 +1,61 @@
+ import re
+ from typing import List
+
+ from ..core.schemas import ContentBlock
+ from ..interfaces.base_processor import BaseProcessor
+
+
+ class TextCleaner(BaseProcessor):
+     """
+     (Tier 2) Cleans text content using regex and whitespace normalization.
+
+     Operates on 'text' and 'md' blocks to remove noise characters or custom patterns.
+     """
+
+     component_name = "TextCleaner"
+
+     def initialize(
+         self, patterns: List[str] = None, normalize_space: bool = True, **kwargs
+     ):
+         """
+         Configure cleaning patterns.
+
+         Args:
+             patterns (List[str]): List of regex patterns to remove from text.
+             normalize_space (bool): If True, collapses runs of spaces/tabs into one space.
+             **kwargs: Additional arguments.
+         """
+         self.normalize_space = normalize_space
+         self.patterns = [re.compile(p) for p in (patterns or [])]
+         self._space_re = re.compile(r"[ \t]+")
+
+     def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
+         """
+         Apply cleaning logic to text blocks.
+
+         Args:
+             blocks (List[ContentBlock]): Input blocks.
+
+         Returns:
+             List[ContentBlock]: Cleaned blocks.
+         """
+         for block in blocks:
+             if block.type not in ["text", "md"]:
+                 continue
+
+             if not isinstance(block.content, str):
+                 continue
+
+             text = block.content
+
+             # 1. Custom Patterns Removal
+             for pat in self.patterns:
+                 text = pat.sub("", text)
+
+             # 2. Whitespace Normalization
+             if self.normalize_space:
+                 text = self._space_re.sub(" ", text)
+
+             block.content = text.strip()
+
+         return blocks
@@ -0,0 +1,400 @@
+ Metadata-Version: 2.4
+ Name: sayou-refinery
+ Version: 0.1.6
+ Summary: Refinery components for the Sayou Data Platform
+ Project-URL: Homepage, https://www.sayouzone.com/
+ Project-URL: Documentation, https://sayouzone.github.io/sayou-fabric/
+ Project-URL: Repository, https://github.com/sayouzone/sayou-fabric
+ Author-email: Sayouzone <contact@sayouzone.com>
+ License: Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
+ Requires-Python: >=3.9
+ Requires-Dist: sayou-core~=0.1.2
+ Description-Content-Type: text/markdown
+
+ # sayou-refinery
+
+ [![PyPI version](https://img.shields.io/pypi/v/sayou-refinery.svg?color=blue)](https://pypi.org/project/sayou-refinery/)
+ [![License](https://img.shields.io/badge/License-Apache%202.0-red.svg)](https://www.apache.org/licenses/LICENSE-2.0)
+ [![Docs](https://img.shields.io/badge/docs-mkdocs-success.svg?logo=materialformkdocs)](https://sayouzone.github.io/sayou-fabric/library-guides/refinery/)
+
+ **The Universal Data Cleaning & Normalization Engine for Sayou Fabric.**
+
+ `sayou-refinery` acts as the "Cleaning Plant" in your data pipeline.
+
+ It transforms heterogeneous raw data (JSON Documents, HTML, DB Records) into a standardized stream of **ContentBlocks**, ensuring that downstream components (like Chunkers or LLMs) receive clean, uniform data regardless of the original source format.
+
+ ## 💡 Core Philosophy
+
+ **"Flatten Structure, Polish Content."**
+
+ Refinery operates in two distinct stages to guarantee data quality:
+
+ 1. **Normalization (Shape Shifting):** Converts complex structures (nested JSON, HTML trees, DB Rows) into a linear list of `ContentBlocks`.
+ 2. **Processing (Cleaning):** Applies a chain of cleaning agents (Regex, Masking, Deduplication) to improve data hygiene.
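+
+ A minimal, hand-wired sketch of these two stages (module paths taken from this package's RECORD; the printed result is illustrative, not guaranteed):
+
+ ```python
+ from sayou.refinery.normalizer.html_text_normalizer import HtmlTextNormalizer
+ from sayou.refinery.processor.pii_masker import PiiMasker
+ from sayou.refinery.processor.text_cleaner import TextCleaner
+
+ html = "<p>Mail   us: admin@sayou.ai</p><script>track()</script>"
+
+ # Stage 1: Normalization -- the HTML tree becomes a single 'text' ContentBlock
+ normalizer = HtmlTextNormalizer()
+ normalizer.initialize()
+ blocks = normalizer.normalize(html)
+
+ # Stage 2: Processing -- whitespace is collapsed, then the email is masked
+ for processor in (TextCleaner(), PiiMasker()):
+     processor.initialize()
+     blocks = processor.process(blocks)
+
+ print(blocks[0].content)  # e.g. "Mail us: [EMAIL]"
+ ```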
+
+ ## 📦 Installation
+
+ ```bash
+ pip install sayou-refinery
+ ```
+
+ ## ⚡ Quick Start
+
+ The `RefineryPipeline` orchestrates the normalization and processing chain.
+
+ ```python
+ from sayou.refinery.pipeline import RefineryPipeline
+
+ def run_demo():
+     # 1. Initialize with specific cleaning rules
+     pipeline = RefineryPipeline()
+     pipeline.initialize(
+         mask_email=True,
+         outlier_rules={"price": {"min": 0, "max": 1000, "action": "clamp"}}
+     )
+
+     # 2. Raw Data (e.g., from sayou-document)
+     raw_doc = {
+         "metadata": {"title": "Test Doc"},
+         "pages": [{
+             "elements": [
+                 {"type": "text", "text": "Contact: admin@sayou.ai"},
+                 {"type": "text", "text": " Dirty Whitespace "}
+             ]
+         }]
+     }
+
+     # 3. Run Pipeline
+     # source_type: 'standard_doc', 'html', 'json', etc.
+     blocks = pipeline.run(raw_doc, source_type="standard_doc")
+
+     # 4. Result
+     for block in blocks:
+         print(f"[{block.type}] {block.content}")
+
+     # Output:
+     # [md] --- title: Test Doc ...
+     # [md] Contact: [EMAIL]
+     # [md] Dirty Whitespace
+
+ if __name__ == "__main__":
+     run_demo()
+ ```
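+
+ Continuing the Quick Start session above, the same pipeline handles other sources by switching `source_type` (a hedged sketch; input values invented for illustration):
+
+ ```python
+ # HTML string -> one clean 'text' block via HtmlTextNormalizer
+ text_blocks = pipeline.run("<p>Hello world</p>", source_type="html")
+
+ # List of dicts (e.g., DB rows) -> 'record' blocks; with the outlier_rules
+ # configured above, a price of 1200 would be clamped to the max of 1000
+ record_blocks = pipeline.run([{"name": "widget", "price": 1200}], source_type="json")
+ ```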
+
+ ## 🔑 Key Components
+
+ ### Normalizers
+ * **`DocMarkdownNormalizer`**: Converts Sayou Document Dicts into Markdown blocks.
+ * **`HtmlTextNormalizer`**: Strips HTML tags and scripts, extracting clean text.
+ * **`RecordNormalizer`**: Converts DB rows or JSON objects into 'record' blocks.
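+
+ Each normalizer can also be used standalone; a sketch with `RecordNormalizer` (import path taken from this package's RECORD):
+
+ ```python
+ from sayou.refinery.normalizer.record_normalizer import RecordNormalizer
+
+ rows = [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]
+ normalizer = RecordNormalizer()
+ normalizer.initialize()
+ blocks = normalizer.normalize(rows)
+ # Each dict becomes one block: type='record', metadata={'fields': ['id', 'name']}
+ ```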
+
+ ### Processors
+ * **`TextCleaner`**: Normalizes whitespace and removes noise via regex.
+ * **`PiiMasker`**: Masks sensitive info like emails and phone numbers.
+ * **`Deduplicator`**: Removes duplicate content blocks.
+ * **`Imputer`**: Fills missing values in record blocks.
+ * **`OutlierHandler`**: Filters or clamps numerical outliers in records.
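+
+ Processors receive their rules through `initialize`. A sketch of rule-driven imputation on a single record block (field names invented for illustration):
+
+ ```python
+ from sayou.refinery.core.schemas import ContentBlock
+ from sayou.refinery.processor.imputer import Imputer
+
+ imputer = Imputer()
+ imputer.initialize(imputation_rules={"category": "Unknown"})
+
+ block = ContentBlock(type="record", content={"id": 7, "category": None})
+ filled = imputer.process([block])[0]
+ print(filled.content)  # {'id': 7, 'category': 'Unknown'}
+ ```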
+
+ ## 🤝 Contributing
+
+ We welcome contributions for new Normalizers (e.g., `CsvNormalizer`, `LogNormalizer`) or Processors (e.g., `LangChainFilter`).
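+
+ A new normalizer only needs to subclass `BaseNormalizer`, declare its `SUPPORTED_TYPES`, and implement the `_do_normalize` hook. A hypothetical `CsvNormalizer`, sketched against the interfaces in this package:
+
+ ```python
+ import csv
+ import io
+ from typing import Any, List
+
+ from sayou.refinery.core.schemas import ContentBlock
+ from sayou.refinery.interfaces.base_normalizer import BaseNormalizer
+
+
+ class CsvNormalizer(BaseNormalizer):
+     """Turns a CSV string into one 'record' block per row (illustrative only)."""
+
+     component_name = "CsvNormalizer"
+     SUPPORTED_TYPES = ["csv"]
+
+     def _do_normalize(self, raw_data: Any) -> List[ContentBlock]:
+         reader = csv.DictReader(io.StringIO(raw_data))
+         return [
+             ContentBlock(
+                 type="record",
+                 content=dict(row),
+                 metadata={"fields": reader.fieldnames},
+             )
+             for row in reader
+         ]
+ ```
+
+ Passing `extra_normalizers=[CsvNormalizer()]` to `RefineryPipeline` then routes `source_type="csv"` to it.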
+
+ ## 📜 License
+
+ Apache 2.0 License © 2025 Sayouzone
@@ -0,0 +1,16 @@
+ sayou/refinery/pipeline.py,sha256=oJbygy300ounS3xL3UdCpwnmdmUTRib-W-ADsPJ1Vjs,3756
+ sayou/refinery/core/exceptions.py,sha256=LhY8tDk9UzqjGjy-7UPpzBSRpH4vUl3ZemmW-BssdJY,547
+ sayou/refinery/core/schemas.py,sha256=LhKV5X8WIiUV273OpX9y7TduQUX03qZh-rgRSMn_eCs,727
+ sayou/refinery/interfaces/base_normalizer.py,sha256=nYQ40IM83WXnIiSIDqhWHfiHAC0O4F9iymb7-u7cSIE,1862
+ sayou/refinery/interfaces/base_processor.py,sha256=A9YvD1ZwHhlK290Y77xkSeJTtBCJ53wAYI-KeuVgShM,1650
+ sayou/refinery/normalizer/doc_markdown_normalizer.py,sha256=ZVkTEjCesrbjWRRetD1Lp6_YjHsdDL9GNXO4yvUYIUw,10277
+ sayou/refinery/normalizer/html_text_normalizer.py,sha256=hX0UTbJwND0Rv-_HuGL-p4Popdrg6_m_mDkKZDYW5AE,1675
+ sayou/refinery/normalizer/record_normalizer.py,sha256=bzErNEVw9g-QtQiq_wdm_AxiZi_uuvtx23AL8DRjgxQ,2029
+ sayou/refinery/processor/deduplicator.py,sha256=yKZkaPyY4P_a-IwIK8f3XCYqgnoF0us0NbMsBDY03ic,1402
+ sayou/refinery/processor/imputer.py,sha256=vvaGvxQNajKSjbYr-gNHmd4HYsD4FtchJhGfnKv-cpo,1562
+ sayou/refinery/processor/outlier_handler.py,sha256=bWcECxwVVvfcjiy5lm3YYABUA_7lIW_Do5Q70rK-mtM,2693
+ sayou/refinery/processor/pii_masker.py,sha256=fbXPE2HLESQFUvNITnLp-9q2L4MEPcfIkLUUGRIva8k,1726
+ sayou/refinery/processor/text_cleaner.py,sha256=8_Hu6H_W__tNfwebm9cS43DRc8EExkhfpkmxslShDjU,1748
+ sayou_refinery-0.1.6.dist-info/METADATA,sha256=zmor5IfNcoOzqqmX2OVwudo42b-43etk20LFf5r5wkg,16989
+ sayou_refinery-0.1.6.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ sayou_refinery-0.1.6.dist-info/RECORD,,
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.28.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any