sayou-refinery 0.1.6__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,64 +1,126 @@
1
- from typing import Any, Dict, List, Optional
1
+ import importlib
2
+ import pkgutil
3
+ from typing import Any, Dict, List, Optional, Type
2
4
 
3
5
  from sayou.core.base_component import BaseComponent
4
6
  from sayou.core.decorators import safe_run
7
+ from sayou.core.registry import COMPONENT_REGISTRY
8
+ from sayou.core.schemas import SayouBlock
5
9
 
6
10
  from .core.exceptions import RefineryError
7
- from .core.schemas import ContentBlock
8
11
  from .interfaces.base_normalizer import BaseNormalizer
9
12
  from .interfaces.base_processor import BaseProcessor
10
- from .normalizer.doc_markdown_normalizer import DocMarkdownNormalizer
11
- from .normalizer.html_text_normalizer import HtmlTextNormalizer
12
- from .normalizer.record_normalizer import RecordNormalizer
13
- from .processor.deduplicator import Deduplicator
14
- from .processor.imputer import Imputer
15
- from .processor.outlier_handler import OutlierHandler
16
- from .processor.pii_masker import PiiMasker
17
- from .processor.text_cleaner import TextCleaner
18
13
 
19
14
 
20
15
  class RefineryPipeline(BaseComponent):
21
16
  """
22
- Orchestrates the data refinement process.
23
- 1. Selects a Normalizer to convert raw data into standard ContentBlocks.
24
- 2. Runs a chain of Processors to clean and transform the blocks.
17
+ Orchestrates the data refinement process via dynamic registry.
18
+
19
+ Workflow:
20
+ 1. Normalization: Converts raw input (Document, HTML, JSON) into standard SayouBlocks.
21
+ 2. Processing: Applies a chain of processors (Cleaning, Masking, Dedup) to the blocks.
25
22
  """
26
23
 
27
24
  component_name = "RefineryPipeline"
28
25
 
29
26
  def __init__(
30
27
  self,
31
- extra_normalizers: Optional[List[BaseNormalizer]] = None,
32
- processors: Optional[List[BaseProcessor]] = None,
28
+ extra_normalizers: Optional[List[Type[BaseNormalizer]]] = None,
29
+ **kwargs,
33
30
  ):
31
+ """
32
+ Initializes the pipeline and discovers available plugins.
33
+
34
+ Args:
35
+ extra_normalizers: Optional list of custom normalizer classes to register.
36
+ **kwargs: Global configuration passed down to components.
37
+ e.g., processors=["cleaner", "pii_masker"]
38
+ """
34
39
  super().__init__()
35
- self.normalizers: Dict[str, BaseNormalizer] = {}
36
40
 
37
- # 1. Register Default Normalizers
38
- defaults = [DocMarkdownNormalizer(), HtmlTextNormalizer(), RecordNormalizer()]
39
- self._register(defaults)
41
+ self.normalizer_cls_map: Dict[str, Type[BaseNormalizer]] = {}
42
+ self.processor_cls_map: Dict[str, Type[BaseProcessor]] = {}
43
+
44
+ self._register("sayou.refinery.normalizer")
45
+ self._register("sayou.refinery.processor")
46
+ self._register("sayou.refinery.plugins")
47
+
48
+ self._load_from_registry()
40
49
 
41
- # 2. Register User Extras
42
50
  if extra_normalizers:
43
- self._register(extra_normalizers)
44
-
45
- # 3. Setup Processors Chain
46
- self.processors = (
47
- processors
48
- if processors is not None
49
- else [
50
- TextCleaner(),
51
- PiiMasker(),
52
- Deduplicator(),
53
- Imputer(),
54
- OutlierHandler(),
55
- ]
56
- )
51
+ for cls in extra_normalizers:
52
+ self._register_manual(cls)
53
+
54
+ self.global_config = kwargs
55
+
56
+ self.initialize(**kwargs)
57
+
58
+ def _register_manual(self, cls):
59
+ """
60
+ Safely registers a user-provided class.
61
+ """
62
+ if not isinstance(cls, type):
63
+ raise TypeError(
64
+ f"Invalid normalizer: {cls}. "
65
+ f"Please pass the CLASS itself (e.g., MyNormalizer), not an instance (MyNormalizer())."
66
+ )
67
+
68
+ name = getattr(cls, "component_name", cls.__name__)
69
+ self.normalizer_cls_map[name] = cls
70
+
71
+ @classmethod
72
+ def process(
73
+ cls,
74
+ raw_data: Any,
75
+ strategy: str = "auto",
76
+ processors: List[str] = None,
77
+ **kwargs,
78
+ ) -> List[SayouBlock]:
79
+ """
80
+ [Facade] One-line execution method.
81
+
82
+ Args:
83
+ raw_data (Any): Input data to refine.
84
+ strategy (str): Hint for normalizer selection (default: 'auto').
85
+ **kwargs: Configuration options.
86
+
87
+ Returns:
88
+ List[SayouBlock]: Refined data blocks.
89
+ """
90
+ instance = cls(**kwargs)
91
+ return instance.run(raw_data, strategy, processors, **kwargs)
92
+
93
+ def _register(self, package_name: str):
94
+ """
95
+ Automatically discovers and registers plugins from the specified package.
96
+
97
+ Args:
98
+ package_name (str): The dot-separated package path (e.g., 'sayou.refinery.plugins').
99
+ """
100
+ try:
101
+ package = importlib.import_module(package_name)
102
+ if hasattr(package, "__path__"):
103
+ for _, name, _ in pkgutil.iter_modules(package.__path__):
104
+ full_name = f"{package_name}.{name}"
105
+ try:
106
+ importlib.import_module(full_name)
107
+ self._log(f"Discovered module: {full_name}", level="debug")
108
+ except Exception as e:
109
+ self._log(
110
+ f"Failed to import module {full_name}: {e}", level="warning"
111
+ )
112
+ except ImportError as e:
113
+ self._log(f"Package not found: {package_name} ({e})", level="debug")
57
114
 
58
- def _register(self, comps: List[BaseNormalizer]):
59
- for c in comps:
60
- for t in getattr(c, "SUPPORTED_TYPES", []):
61
- self.normalizers[t] = c
115
+ def _load_from_registry(self):
116
+ """
117
+ Populates local component maps from the global registry.
118
+ """
119
+ if "normalizer" in COMPONENT_REGISTRY:
120
+ self.normalizer_cls_map.update(COMPONENT_REGISTRY["normalizer"])
121
+
122
+ if "processor" in COMPONENT_REGISTRY:
123
+ self.processor_cls_map.update(COMPONENT_REGISTRY["processor"])
62
124
 
63
125
  @safe_run(default_return=None)
64
126
  def initialize(self, **kwargs):
@@ -66,46 +128,172 @@ class RefineryPipeline(BaseComponent):
66
128
  Initialize all sub-components (Normalizers and Processors).
67
129
  Passes global configuration (like PII masking rules) down to components.
68
130
  """
69
- for norm in set(self.normalizers.values()):
70
- norm.initialize(**kwargs)
71
-
72
- for proc in self.processors:
73
- proc.initialize(**kwargs)
131
+ """
132
+ Updates global configuration and logs status.
133
+ Actual component instantiation happens lazily during run().
134
+
135
+ Args:
136
+ **kwargs: Updates to the global configuration.
137
+ """
138
+ self.global_config.update(kwargs)
74
139
 
140
+ n_norm = len(self.normalizer_cls_map)
141
+ n_proc = len(self.processor_cls_map)
75
142
  self._log(
76
- f"Refinery initialized with {len(self.processors)} processors in chain."
143
+ f"RefineryPipeline initialized. Available: {n_norm} Normalizers, {n_proc} Processors."
77
144
  )
78
145
 
79
146
  def run(
80
- self, raw_data: Any, source_type: str = "standard_doc"
81
- ) -> List[ContentBlock]:
147
+ self,
148
+ raw_data: Any,
149
+ strategy: str = "auto",
150
+ processors: Optional[List[str]] = None,
151
+ **kwargs,
152
+ ) -> List[SayouBlock]:
82
153
  """
83
- Execute the refinement pipeline.
154
+ Executes the refinement pipeline: Normalize -> Process Chain.
84
155
 
85
156
  Args:
86
- raw_data: The raw input data (dict, html string, db row list, etc.)
87
- source_type: The type of input data (e.g., 'standard_doc', 'html', 'json')
157
+ raw_data (Any): Input data (Document object, dict, string, etc.).
158
+ strategy (str): Hint for normalizer (default: 'auto').
159
+ processors (List[str], optional): List of processor names to execute in order.
160
+ If None, executes all registered processors (or a default set).
161
+ **kwargs: Runtime configuration.
88
162
 
89
163
  Returns:
90
- List[ContentBlock]: A list of clean, normalized blocks.
91
- """
92
- # Step 1: Normalize (Structure Transformation)
93
- normalizer = self.normalizers.get(source_type)
94
- if not normalizer:
95
- supported = list(self.normalizers.keys())
96
- raise RefineryError(
97
- f"Unknown source_type '{source_type}'. Supported: {supported}"
98
- )
164
+ List[SayouBlock]: A list of clean, normalized blocks.
165
+ """
166
+ if raw_data is None:
167
+ return []
168
+
169
+ run_config = {**self.global_config, **kwargs}
170
+
171
+ self._emit("on_start", input_data={"strategy": strategy})
172
+
173
+ # ---------------------------------------------------------
174
+ # Step 1: Normalize (Smart Routing)
175
+ # ---------------------------------------------------------
176
+ normalizer_cls = self._resolve_normalizer(raw_data, strategy)
177
+
178
+ if not normalizer_cls:
179
+ error_msg = f"No suitable normalizer found for strategy='{strategy}'"
180
+ self._emit("on_error", error=Exception(error_msg))
181
+ raise RefineryError(error_msg)
182
+
183
+ # Instantiate Normalizer
184
+ normalizer = normalizer_cls()
185
+
186
+ if hasattr(self, "_callbacks"):
187
+ for cb in self._callbacks:
188
+ normalizer.add_callback(cb)
189
+
190
+ normalizer.initialize(**run_config)
99
191
 
100
192
  try:
193
+ self._log(f"Normalizing with {normalizer.component_name}...")
101
194
  blocks = normalizer.normalize(raw_data)
102
195
  except Exception as e:
103
- self.logger.error(f"Normalization step failed: {e}")
196
+ self._emit("on_error", error=e)
197
+ self._log(f"Normalization failed: {e}", level="error")
104
198
  return []
105
199
 
106
- # Step 2: Process (Content Cleaning)
107
- # Processors modify blocks in-place or return new lists
108
- for processor in self.processors:
109
- blocks = processor.process(blocks)
200
+ # ---------------------------------------------------------
201
+ # Step 2: Process Chain (Dynamic Execution)
202
+ # ---------------------------------------------------------
203
+ chain_names = (
204
+ processors if processors is not None else run_config.get("processors", [])
205
+ )
206
+
207
+ if not chain_names and not processors:
208
+ chain_names = []
209
+
210
+ active_processors = []
211
+
212
+ for name in chain_names:
213
+ proc_cls = self._resolve_processor_by_name(name)
214
+ if proc_cls:
215
+ proc = proc_cls()
216
+ proc.initialize(**run_config)
217
+ active_processors.append(proc)
218
+ else:
219
+ self._log(f"Processor '{name}' not found in registry.", level="warning")
110
220
 
221
+ for proc in active_processors:
222
+ try:
223
+ self._log(f"Running Processor: {proc.component_name}")
224
+ blocks = proc.process(blocks)
225
+ except Exception as e:
226
+ self._log(f"Processor {proc.component_name} failed: {e}", level="error")
227
+
228
+ self._emit("on_finish", result_data={"blocks_count": len(blocks)}, success=True)
111
229
  return blocks
230
+
231
+ def _resolve_normalizer(
232
+ self,
233
+ raw_data: Any,
234
+ strategy: str,
235
+ ) -> Optional[Type[BaseNormalizer]]:
236
+ """
237
+ Selects the best normalizer based on score or explicit type match.
238
+ """
239
+ if strategy in self.normalizer_cls_map:
240
+ return self.normalizer_cls_map[strategy]
241
+
242
+ best_score = 0.0
243
+ best_cls = None
244
+
245
+ obj_type = getattr(raw_data, "type", type(raw_data).__name__)
246
+ content_len = 0
247
+ if hasattr(raw_data, "content"):
248
+ c = raw_data.content
249
+ if hasattr(c, "__len__"):
250
+ content_len = len(c)
251
+ elif isinstance(raw_data, (str, bytes, list, dict)):
252
+ content_len = len(raw_data)
253
+
254
+ log_lines = [f"Scoring for Item (Type: {obj_type}, Len: {content_len}):"]
255
+ if hasattr(raw_data, "content") and isinstance(raw_data.content, str):
256
+ log_lines.append(f"Content Preview: {raw_data.content[:50]}...")
257
+ elif isinstance(raw_data, str):
258
+ log_lines.append(f"Content Preview: {raw_data[:50]}...")
259
+
260
+ for cls in set(self.normalizer_cls_map.values()):
261
+ try:
262
+ score = cls.can_handle(raw_data, strategy)
263
+
264
+ mark = ""
265
+ if score > best_score:
266
+ best_score = score
267
+ best_cls = cls
268
+ mark = "👑"
269
+
270
+ log_lines.append(f" - {cls.__name__}: {score} {mark}")
271
+
272
+ except Exception as e:
273
+ log_lines.append(f" - {cls.__name__}: Error ({e})")
274
+
275
+ self._log("\n".join(log_lines))
276
+
277
+ if best_cls and best_score > 0.0:
278
+ return best_cls
279
+
280
+ self._log(
281
+ "⚠️ No suitable normalizer found (Score 0).",
282
+ level="warning",
283
+ )
284
+ return None
285
+
286
+ def _resolve_processor_by_name(self, name: str) -> Optional[Type[BaseProcessor]]:
287
+ """
288
+ Finds a processor class by its component_name or registry key.
289
+ """
290
+ # 1. Exact Key Match
291
+ if name in self.processor_cls_map:
292
+ return self.processor_cls_map[name]
293
+
294
+ # 2. Component Name Match (Loop search)
295
+ for cls in self.processor_cls_map.values():
296
+ if getattr(cls, "component_name", "") == name:
297
+ return cls
298
+
299
+ return None
@@ -1,10 +1,13 @@
1
1
  import json
2
2
  from typing import List, Set
3
3
 
4
- from ..core.schemas import ContentBlock
4
+ from sayou.core.registry import register_component
5
+ from sayou.core.schemas import SayouBlock
6
+
5
7
  from ..interfaces.base_processor import BaseProcessor
6
8
 
7
9
 
10
+ @register_component("processor")
8
11
  class Deduplicator(BaseProcessor):
9
12
  """
10
13
  (Tier 2) Removes duplicate blocks based on content hashing.
@@ -15,18 +18,24 @@ class Deduplicator(BaseProcessor):
15
18
 
16
19
  component_name = "Deduplicator"
17
20
 
18
- def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
21
+ @classmethod
22
+ def can_handle(cls, blocks: list) -> float:
23
+ if isinstance(blocks, list) and len(blocks) > 1:
24
+ return 1.0
25
+ return 0.0
26
+
27
+ def _do_process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
19
28
  """
20
29
  Iterate through blocks and remove duplicates.
21
30
 
22
31
  Args:
23
- blocks (List[ContentBlock]): The input list of blocks.
32
+ blocks (List[SayouBlock]): The input list of blocks.
24
33
 
25
34
  Returns:
26
- List[ContentBlock]: A new list with duplicates removed.
35
+ List[SayouBlock]: A new list with duplicates removed.
27
36
  """
28
37
  seen_hashes: Set[int] = set()
29
- unique_blocks: List[ContentBlock] = []
38
+ unique_blocks: List[SayouBlock] = []
30
39
 
31
40
  for block in blocks:
32
41
  # Generate stable hash key
@@ -1,9 +1,12 @@
1
1
  from typing import Any, Dict, List
2
2
 
3
- from ..core.schemas import ContentBlock
3
+ from sayou.core.registry import register_component
4
+ from sayou.core.schemas import SayouBlock
5
+
4
6
  from ..interfaces.base_processor import BaseProcessor
5
7
 
6
8
 
9
+ @register_component("processor")
7
10
  class Imputer(BaseProcessor):
8
11
  """
9
12
  (Tier 2) Fills missing values in 'record' type blocks using defined rules.
@@ -13,6 +16,12 @@ class Imputer(BaseProcessor):
13
16
 
14
17
  component_name = "Imputer"
15
18
 
19
+ @classmethod
20
+ def can_handle(cls, blocks: list) -> float:
21
+ if super().can_handle(blocks) > 0:
22
+ return 0.8
23
+ return 0.0
24
+
16
25
  def initialize(self, imputation_rules: Dict[str, Any] = None, **kwargs):
17
26
  """
18
27
  Set imputation rules.
@@ -26,15 +35,15 @@ class Imputer(BaseProcessor):
26
35
  if not self.rules:
27
36
  self._log("Imputer initialized with no rules.", level="warning")
28
37
 
29
- def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
38
+ def _do_process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
30
39
  """
31
40
  Apply imputation rules to record blocks.
32
41
 
33
42
  Args:
34
- blocks (List[ContentBlock]): Input blocks.
43
+ blocks (List[SayouBlock]): Input blocks.
35
44
 
36
45
  Returns:
37
- List[ContentBlock]: Blocks with missing values filled.
46
+ List[SayouBlock]: Blocks with missing values filled.
38
47
  """
39
48
  for block in blocks:
40
49
  if block.type != "record" or not isinstance(block.content, dict):
@@ -1,9 +1,12 @@
1
1
  from typing import Any, Dict, List
2
2
 
3
- from ..core.schemas import ContentBlock
3
+ from sayou.core.registry import register_component
4
+ from sayou.core.schemas import SayouBlock
5
+
4
6
  from ..interfaces.base_processor import BaseProcessor
5
7
 
6
8
 
9
+ @register_component("processor")
7
10
  class OutlierHandler(BaseProcessor):
8
11
  """
9
12
  (Tier 2) Handles numerical outliers in 'record' blocks.
@@ -14,6 +17,10 @@ class OutlierHandler(BaseProcessor):
14
17
 
15
18
  component_name = "OutlierHandler"
16
19
 
20
+ @classmethod
21
+ def can_handle(cls, blocks: list) -> float:
22
+ return 0.8 if super().can_handle(blocks) > 0 else 0.0
23
+
17
24
  def initialize(self, outlier_rules: Dict[str, Dict[str, Any]] = None, **kwargs):
18
25
  """
19
26
  Set outlier handling rules.
@@ -29,15 +36,15 @@ class OutlierHandler(BaseProcessor):
29
36
  """
30
37
  self.rules = outlier_rules or {}
31
38
 
32
- def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
39
+ def _do_process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
33
40
  """
34
41
  Check numerical fields against rules and filter/modify blocks.
35
42
 
36
43
  Args:
37
- blocks (List[ContentBlock]): Input blocks.
44
+ blocks (List[SayouBlock]): Input blocks.
38
45
 
39
46
  Returns:
40
- List[ContentBlock]: Filtered or modified list of blocks.
47
+ List[SayouBlock]: Filtered or modified list of blocks.
41
48
  """
42
49
  valid_blocks = []
43
50
 
@@ -1,10 +1,13 @@
1
1
  import re
2
2
  from typing import List
3
3
 
4
- from ..core.schemas import ContentBlock
4
+ from sayou.core.registry import register_component
5
+ from sayou.core.schemas import SayouBlock
6
+
5
7
  from ..interfaces.base_processor import BaseProcessor
6
8
 
7
9
 
10
+ @register_component("processor")
8
11
  class PiiMasker(BaseProcessor):
9
12
  """
10
13
  (Tier 2) Masks Personally Identifiable Information (PII) in text blocks.
@@ -15,6 +18,10 @@ class PiiMasker(BaseProcessor):
15
18
 
16
19
  component_name = "PiiMasker"
17
20
 
21
+ @classmethod
22
+ def can_handle(cls, blocks: list) -> float:
23
+ return 1.0 if super().can_handle(blocks) > 0 else 0.0
24
+
18
25
  def initialize(self, mask_email: bool = True, mask_phone: bool = True, **kwargs):
19
26
  """
20
27
  Configure masking targets.
@@ -30,15 +37,15 @@ class PiiMasker(BaseProcessor):
30
37
  # Simple phone regex (customizable)
31
38
  self._phone_re = re.compile(r"\d{3}[-\.\s]??\d{3,4}[-\.\s]??\d{4}")
32
39
 
33
- def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
40
+ def _do_process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
34
41
  """
35
42
  Apply masking regex to text content.
36
43
 
37
44
  Args:
38
- blocks (List[ContentBlock]): Input blocks.
45
+ blocks (List[SayouBlock]): Input blocks.
39
46
 
40
47
  Returns:
41
- List[ContentBlock]: Blocks with sensitive info replaced by tokens.
48
+ List[SayouBlock]: Blocks with sensitive info replaced by tokens.
42
49
  """
43
50
  for block in blocks:
44
51
  if block.type not in ["text", "md"] or not isinstance(block.content, str):
@@ -1,10 +1,13 @@
1
1
  import re
2
2
  from typing import List
3
3
 
4
- from ..core.schemas import ContentBlock
4
+ from sayou.core.registry import register_component
5
+ from sayou.core.schemas import SayouBlock
6
+
5
7
  from ..interfaces.base_processor import BaseProcessor
6
8
 
7
9
 
10
+ @register_component("processor")
8
11
  class TextCleaner(BaseProcessor):
9
12
  """
10
13
  (Tier 2) Cleans text content using regex and whitespace normalization.
@@ -14,6 +17,12 @@ class TextCleaner(BaseProcessor):
14
17
 
15
18
  component_name = "TextCleaner"
16
19
 
20
+ @classmethod
21
+ def can_handle(cls, blocks: list) -> float:
22
+ if super().can_handle(blocks) > 0:
23
+ return 1.0
24
+ return 0.0
25
+
17
26
  def initialize(
18
27
  self, patterns: List[str] = None, normalize_space: bool = True, **kwargs
19
28
  ):
@@ -29,15 +38,15 @@ class TextCleaner(BaseProcessor):
29
38
  self.patterns = [re.compile(p) for p in (patterns or [])]
30
39
  self._space_re = re.compile(r"[ \t]+")
31
40
 
32
- def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
41
+ def _do_process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
33
42
  """
34
43
  Apply cleaning logic to text blocks.
35
44
 
36
45
  Args:
37
- blocks (List[ContentBlock]): Input blocks.
46
+ blocks (List[SayouBlock]): Input blocks.
38
47
 
39
48
  Returns:
40
- List[ContentBlock]: Cleaned blocks.
49
+ List[SayouBlock]: Cleaned blocks.
41
50
  """
42
51
  for block in blocks:
43
52
  if block.type not in ["text", "md"]:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sayou-refinery
3
- Version: 0.1.6
3
+ Version: 0.3.3
4
4
  Summary: Refinery components for the Sayou Data Platform
5
5
  Project-URL: Homepage, https://www.sayouzone.com/
6
6
  Project-URL: Documentation, https://sayouzone.github.io/sayou-fabric/
@@ -214,7 +214,7 @@ Classifier: Programming Language :: Python :: 3.10
214
214
  Classifier: Programming Language :: Python :: 3.11
215
215
  Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
216
216
  Requires-Python: >=3.9
217
- Requires-Dist: sayou-core~=0.1.2
217
+ Requires-Dist: sayou-core~=0.3.0
218
218
  Description-Content-Type: text/markdown
219
219
 
220
220
  # sayou-refinery
@@ -227,7 +227,7 @@ Description-Content-Type: text/markdown
227
227
 
228
228
  `sayou-refinery` acts as the "Cleaning Plant" in your data pipeline.
229
229
 
230
- It transforms heterogeneous raw data (JSON Documents, HTML, DB Records) into a standardized stream of **ContentBlocks**, ensuring that downstream components (like Chunkers or LLMs) receive clean, uniform data regardless of the original source format.
230
+ It transforms heterogeneous raw data (JSON Documents, HTML, DB Records) into a standardized stream of **SayouBlocks**, ensuring that downstream components (like Chunkers or LLMs) receive clean, uniform data regardless of the original source format.
231
231
 
232
232
  ## 💡 Core Philosophy
233
233
 
@@ -235,7 +235,7 @@ It transforms heterogeneous raw data (JSON Documents, HTML, DB Records) into a s
235
235
 
236
236
  Refinery operates in two distinct stages to guarantee data quality:
237
237
 
238
- 1. **Normalization (Shape Shifting):** Converts complex structures (nested JSON, HTML trees, DB Rows) into a linear list of `ContentBlocks`.
238
+ 1. **Normalization (Shape Shifting):** Converts complex structures (nested JSON, HTML trees, DB Rows) into a linear list of `SayouBlocks`.
239
239
  2. **Processing (Cleaning):** Applies a chain of cleaning agents (Regex, Masking, Deduplication) to improve data hygiene.
240
240
 
241
241
  ## 📦 Installation
@@ -271,8 +271,8 @@ def run_demo():
271
271
  }
272
272
 
273
273
  # 3. Run Pipeline
274
- # source_type: 'standard_doc', 'html', 'json', etc.
275
- blocks = pipeline.run(raw_doc, source_type="standard_doc")
274
+ # strategy: 'standard_doc', 'html', 'json', etc.
275
+ blocks = pipeline.run(raw_doc, strategy="standard_doc")
276
276
 
277
277
  # 4. Result
278
278
  for block in blocks:
@@ -0,0 +1,16 @@
1
+ sayou/refinery/__init__.py,sha256=KjeNn3m72sukzHBqbvMvBz0g1zzHOdBfZjgfdBJ55_E,677
2
+ sayou/refinery/pipeline.py,sha256=KR3B4QM7diYBJVwTP50L8MWykxABZw3oPbFUeo2SkjY,10393
3
+ sayou/refinery/core/exceptions.py,sha256=_WUPH9EJ7y1JXLMtuBS7I63TPd_GztX8GtJj2JQwV1U,545
4
+ sayou/refinery/interfaces/base_normalizer.py,sha256=CpmAEMM73uTJBmsIcyuMn2S6Ozaf74UCzQHIsoR1F4Y,2491
5
+ sayou/refinery/interfaces/base_processor.py,sha256=whZg1LD-gmSr6b2Hnw1LBAlo0eN_Yu_N5CVKSiesie8,2274
6
+ sayou/refinery/normalizer/doc_markdown_normalizer.py,sha256=t3mxTTYlQ7-WO8JwwOZXi6Cow8aiu1WCVtjLQVihagE,12449
7
+ sayou/refinery/normalizer/html_text_normalizer.py,sha256=k7cDOg-KQIPpABME7onAq02XGkrgm-FWsrKnckeiOCY,2659
8
+ sayou/refinery/normalizer/record_normalizer.py,sha256=0otSAKaaf7MnEyObW2CeyX0TxOUwubu9T1ae6qNgT04,2512
9
+ sayou/refinery/processor/deduplicator.py,sha256=LLjQbipIyI0VuUoe1gzyihfpKgULWrmRYBtXntzfTdU,1644
10
+ sayou/refinery/processor/imputer.py,sha256=x2WAQcHUWwzDnybNlL92bRAvkMmj32B059kihgyXvxY,1792
11
+ sayou/refinery/processor/outlier_handler.py,sha256=rMSKcwmSvlE_LnrhyTbVjOku6Q3WKY5_CNhijSCWFBE,2900
12
+ sayou/refinery/processor/pii_masker.py,sha256=MmS6HNA0w4EgjkPSkWlDGZ1QRNk_9qhU94EVc3cuH-8,1933
13
+ sayou/refinery/processor/text_cleaner.py,sha256=nI6QHHWm_szValqUxdyq5i_54ttlUqyvsy0FzNqrTw0,1978
14
+ sayou_refinery-0.3.3.dist-info/METADATA,sha256=k4Akjq3Ht-2aTgcQOFsBp4I7Pga0iZfsNHfuw9OcXaA,16979
15
+ sayou_refinery-0.3.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
16
+ sayou_refinery-0.3.3.dist-info/RECORD,,