sayou-refinery 0.1.6__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sayou/refinery/__init__.py +21 -0
- sayou/refinery/core/exceptions.py +1 -1
- sayou/refinery/interfaces/base_normalizer.py +29 -8
- sayou/refinery/interfaces/base_processor.py +29 -9
- sayou/refinery/normalizer/doc_markdown_normalizer.py +107 -39
- sayou/refinery/normalizer/html_text_normalizer.py +36 -10
- sayou/refinery/normalizer/record_normalizer.py +26 -9
- sayou/refinery/pipeline.py +251 -63
- sayou/refinery/processor/deduplicator.py +14 -5
- sayou/refinery/processor/imputer.py +13 -4
- sayou/refinery/processor/outlier_handler.py +11 -4
- sayou/refinery/processor/pii_masker.py +11 -4
- sayou/refinery/processor/text_cleaner.py +13 -4
- {sayou_refinery-0.1.6.dist-info → sayou_refinery-0.3.3.dist-info}/METADATA +6 -6
- sayou_refinery-0.3.3.dist-info/RECORD +16 -0
- sayou/refinery/core/schemas.py +0 -27
- sayou_refinery-0.1.6.dist-info/RECORD +0 -16
- {sayou_refinery-0.1.6.dist-info → sayou_refinery-0.3.3.dist-info}/WHEEL +0 -0
sayou/refinery/pipeline.py
CHANGED
@@ -1,64 +1,126 @@
-
+import importlib
+import pkgutil
+from typing import Any, Dict, List, Optional, Type
 
 from sayou.core.base_component import BaseComponent
 from sayou.core.decorators import safe_run
+from sayou.core.registry import COMPONENT_REGISTRY
+from sayou.core.schemas import SayouBlock
 
 from .core.exceptions import RefineryError
-from .core.schemas import ContentBlock
 from .interfaces.base_normalizer import BaseNormalizer
 from .interfaces.base_processor import BaseProcessor
-from .normalizer.doc_markdown_normalizer import DocMarkdownNormalizer
-from .normalizer.html_text_normalizer import HtmlTextNormalizer
-from .normalizer.record_normalizer import RecordNormalizer
-from .processor.deduplicator import Deduplicator
-from .processor.imputer import Imputer
-from .processor.outlier_handler import OutlierHandler
-from .processor.pii_masker import PiiMasker
-from .processor.text_cleaner import TextCleaner
 
 
 class RefineryPipeline(BaseComponent):
     """
-    Orchestrates the data refinement process.
-
-
+    Orchestrates the data refinement process via dynamic registry.
+
+    Workflow:
+        1. Normalization: Converts raw input (Document, HTML, JSON) into standard SayouBlocks.
+        2. Processing: Applies a chain of processors (Cleaning, Masking, Dedup) to the blocks.
     """
 
     component_name = "RefineryPipeline"
 
     def __init__(
         self,
-        extra_normalizers: Optional[List[BaseNormalizer]] = None,
-
+        extra_normalizers: Optional[List[Type[BaseNormalizer]]] = None,
+        **kwargs,
     ):
+        """
+        Initializes the pipeline and discovers available plugins.
+
+        Args:
+            extra_normalizers: Optional list of custom normalizer classes to register.
+            **kwargs: Global configuration passed down to components.
+                e.g., processors=["cleaner", "pii_masker"]
+        """
         super().__init__()
-        self.normalizers: Dict[str, BaseNormalizer] = {}
 
-
-
-
+        self.normalizer_cls_map: Dict[str, Type[BaseNormalizer]] = {}
+        self.processor_cls_map: Dict[str, Type[BaseProcessor]] = {}
+
+        self._register("sayou.refinery.normalizer")
+        self._register("sayou.refinery.processor")
+        self._register("sayou.refinery.plugins")
+
+        self._load_from_registry()
 
-        # 2. Register User Extras
         if extra_normalizers:
-
-
-
-            self.
-
-
-
-
-
-
-
-
-
-
+            for cls in extra_normalizers:
+                self._register_manual(cls)
+
+        self.global_config = kwargs
+
+        self.initialize(**kwargs)
+
+    def _register_manual(self, cls):
+        """
+        Safely registers a user-provided class.
+        """
+        if not isinstance(cls, type):
+            raise TypeError(
+                f"Invalid normalizer: {cls}. "
+                f"Please pass the CLASS itself (e.g., MyNormalizer), not an instance (MyNormalizer())."
+            )
+
+        name = getattr(cls, "component_name", cls.__name__)
+        self.normalizer_cls_map[name] = cls
+
+    @classmethod
+    def process(
+        cls,
+        raw_data: Any,
+        strategy: str = "auto",
+        processors: List[str] = None,
+        **kwargs,
+    ) -> List[SayouBlock]:
+        """
+        [Facade] One-line execution method.
+
+        Args:
+            raw_data (Any): Input data to refine.
+            strategy (str): Hint for normalizer selection (default: 'auto').
+            **kwargs: Configuration options.
+
+        Returns:
+            List[SayouBlock]: Refined data blocks.
+        """
+        instance = cls(**kwargs)
+        return instance.run(raw_data, strategy, processors, **kwargs)
+
+    def _register(self, package_name: str):
+        """
+        Automatically discovers and registers plugins from the specified package.
+
+        Args:
+            package_name (str): The dot-separated package path (e.g., 'sayou.refinery.plugins').
+        """
+        try:
+            package = importlib.import_module(package_name)
+            if hasattr(package, "__path__"):
+                for _, name, _ in pkgutil.iter_modules(package.__path__):
+                    full_name = f"{package_name}.{name}"
+                    try:
+                        importlib.import_module(full_name)
+                        self._log(f"Discovered module: {full_name}", level="debug")
+                    except Exception as e:
+                        self._log(
+                            f"Failed to import module {full_name}: {e}", level="warning"
+                        )
+        except ImportError as e:
+            self._log(f"Package not found: {package_name} ({e})", level="debug")
 
-    def
-
-
-
+    def _load_from_registry(self):
+        """
+        Populates local component maps from the global registry.
+        """
+        if "normalizer" in COMPONENT_REGISTRY:
+            self.normalizer_cls_map.update(COMPONENT_REGISTRY["normalizer"])
+
+        if "processor" in COMPONENT_REGISTRY:
+            self.processor_cls_map.update(COMPONENT_REGISTRY["processor"])
 
     @safe_run(default_return=None)
     def initialize(self, **kwargs):
@@ -66,46 +128,172 @@ class RefineryPipeline(BaseComponent):
         Initialize all sub-components (Normalizers and Processors).
         Passes global configuration (like PII masking rules) down to components.
         """
-
-
-
-
-
+        """
+        Updates global configuration and logs status.
+        Actual component instantiation happens lazily during run().
+
+        Args:
+            **kwargs: Updates to the global configuration.
+        """
+        self.global_config.update(kwargs)
 
+        n_norm = len(self.normalizer_cls_map)
+        n_proc = len(self.processor_cls_map)
         self._log(
-            f"
+            f"RefineryPipeline initialized. Available: {n_norm} Normalizers, {n_proc} Processors."
         )
 
     def run(
-        self,
-
+        self,
+        raw_data: Any,
+        strategy: str = "auto",
+        processors: Optional[List[str]] = None,
+        **kwargs,
+    ) -> List[SayouBlock]:
         """
-
+        Executes the refinement pipeline: Normalize -> Process Chain.
 
         Args:
-            raw_data:
-
+            raw_data (Any): Input data (Document object, dict, string, etc.).
+            strategy (str): Hint for normalizer (default: 'auto').
+            processors (List[str], optional): List of processor names to execute in order.
+                If None, executes all registered processors (or a default set).
+            **kwargs: Runtime configuration.
 
         Returns:
-            List[
-        """
-
-
-
-
-
-
-
+            List[SayouBlock]: A list of clean, normalized blocks.
+        """
+        if raw_data is None:
+            return []
+
+        run_config = {**self.global_config, **kwargs}
+
+        self._emit("on_start", input_data={"strategy": strategy})
+
+        # ---------------------------------------------------------
+        # Step 1: Normalize (Smart Routing)
+        # ---------------------------------------------------------
+        normalizer_cls = self._resolve_normalizer(raw_data, strategy)
+
+        if not normalizer_cls:
+            error_msg = f"No suitable normalizer found for strategy='{strategy}'"
+            self._emit("on_error", error=Exception(error_msg))
+            raise RefineryError(error_msg)
+
+        # Instantiate Normalizer
+        normalizer = normalizer_cls()
+
+        if hasattr(self, "_callbacks"):
+            for cb in self._callbacks:
+                normalizer.add_callback(cb)
+
+        normalizer.initialize(**run_config)
 
         try:
+            self._log(f"Normalizing with {normalizer.component_name}...")
             blocks = normalizer.normalize(raw_data)
         except Exception as e:
-            self.
+            self._emit("on_error", error=e)
+            self._log(f"Normalization failed: {e}", level="error")
             return []
 
-        #
-        #
-
-
+        # ---------------------------------------------------------
+        # Step 2: Process Chain (Dynamic Execution)
+        # ---------------------------------------------------------
+        chain_names = (
+            processors if processors is not None else run_config.get("processors", [])
+        )
+
+        if not chain_names and not processors:
+            chain_names = []
+
+        active_processors = []
+
+        for name in chain_names:
+            proc_cls = self._resolve_processor_by_name(name)
+            if proc_cls:
+                proc = proc_cls()
+                proc.initialize(**run_config)
+                active_processors.append(proc)
+            else:
+                self._log(f"Processor '{name}' not found in registry.", level="warning")
 
+        for proc in active_processors:
+            try:
+                self._log(f"Running Processor: {proc.component_name}")
+                blocks = proc.process(blocks)
+            except Exception as e:
+                self._log(f"Processor {proc.component_name} failed: {e}", level="error")
+
+        self._emit("on_finish", result_data={"blocks_count": len(blocks)}, success=True)
         return blocks
+
+    def _resolve_normalizer(
+        self,
+        raw_data: Any,
+        strategy: str,
+    ) -> Optional[Type[BaseNormalizer]]:
+        """
+        Selects the best normalizer based on score or explicit type match.
+        """
+        if strategy in self.normalizer_cls_map:
+            return self.normalizer_cls_map[strategy]
+
+        best_score = 0.0
+        best_cls = None
+
+        obj_type = getattr(raw_data, "type", type(raw_data).__name__)
+        content_len = 0
+        if hasattr(raw_data, "content"):
+            c = raw_data.content
+            if hasattr(c, "__len__"):
+                content_len = len(c)
+        elif isinstance(raw_data, (str, bytes, list, dict)):
+            content_len = len(raw_data)
+
+        log_lines = [f"Scoring for Item (Type: {obj_type}, Len: {content_len}):"]
+        if hasattr(raw_data, "content") and isinstance(raw_data.content, str):
+            log_lines.append(f"Content Preview: {raw_data.content[:50]}...")
+        elif isinstance(raw_data, str):
+            log_lines.append(f"Content Preview: {raw_data[:50]}...")
+
+        for cls in set(self.normalizer_cls_map.values()):
+            try:
+                score = cls.can_handle(raw_data, strategy)
+
+                mark = ""
+                if score > best_score:
+                    best_score = score
+                    best_cls = cls
+                    mark = "👑"
+
+                log_lines.append(f" - {cls.__name__}: {score} {mark}")
+
+            except Exception as e:
+                log_lines.append(f" - {cls.__name__}: Error ({e})")
+
+        self._log("\n".join(log_lines))
+
+        if best_cls and best_score > 0.0:
+            return best_cls
+
+        self._log(
+            "⚠️ No suitable normalizer found (Score 0).",
+            level="warning",
+        )
+        return None
+
+    def _resolve_processor_by_name(self, name: str) -> Optional[Type[BaseProcessor]]:
+        """
+        Finds a processor class by its component_name or registry key.
+        """
+        # 1. Exact Key Match
+        if name in self.processor_cls_map:
+            return self.processor_cls_map[name]
+
+        # 2. Component Name Match (Loop search)
+        for cls in self.processor_cls_map.values():
+            if getattr(cls, "component_name", "") == name:
+                return cls
+
+        return None
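For orientation, here is a rough usage sketch of the new 0.3.3 facade. It only uses names visible in the diff above (RefineryPipeline.process, the strategy hint, a processors list resolved by component_name, and SayouBlock's type/content fields); the sample input string and the chosen processor names are illustrative, and whether a plain string scores against the built-in normalizers depends on their can_handle implementations, which this diff does not show.

    # Hypothetical sketch based on the 0.3.3 pipeline.py above; not from the package docs.
    from sayou.refinery.pipeline import RefineryPipeline

    raw = "Contact me at jane.doe@example.com or 010-1234-5678."

    # The facade instantiates the pipeline, auto-selects a normalizer for the input,
    # then runs the named processors in order (each name is matched against a
    # registry key first, then against a component_name).
    blocks = RefineryPipeline.process(
        raw,
        strategy="auto",
        processors=["TextCleaner", "PiiMasker"],
    )

    for block in blocks:
        print(block.type, block.content)

Calling RefineryPipeline(**config).run(raw_data, strategy, processors) directly is equivalent, since process() simply instantiates the pipeline and delegates to run().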
sayou/refinery/processor/deduplicator.py
CHANGED
@@ -1,10 +1,13 @@
 import json
 from typing import List, Set
 
-from
+from sayou.core.registry import register_component
+from sayou.core.schemas import SayouBlock
+
 from ..interfaces.base_processor import BaseProcessor
 
 
+@register_component("processor")
 class Deduplicator(BaseProcessor):
     """
     (Tier 2) Removes duplicate blocks based on content hashing.
@@ -15,18 +18,24 @@ class Deduplicator(BaseProcessor):
 
     component_name = "Deduplicator"
 
-
+    @classmethod
+    def can_handle(cls, blocks: list) -> float:
+        if isinstance(blocks, list) and len(blocks) > 1:
+            return 1.0
+        return 0.0
+
+    def _do_process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
         """
         Iterate through blocks and remove duplicates.
 
         Args:
-            blocks (List[
+            blocks (List[SayouBlock]): The input list of blocks.
 
         Returns:
-            List[
+            List[SayouBlock]: A new list with duplicates removed.
         """
         seen_hashes: Set[int] = set()
-        unique_blocks: List[
+        unique_blocks: List[SayouBlock] = []
 
         for block in blocks:
             # Generate stable hash key
sayou/refinery/processor/imputer.py
CHANGED
@@ -1,9 +1,12 @@
 from typing import Any, Dict, List
 
-from
+from sayou.core.registry import register_component
+from sayou.core.schemas import SayouBlock
+
 from ..interfaces.base_processor import BaseProcessor
 
 
+@register_component("processor")
 class Imputer(BaseProcessor):
     """
     (Tier 2) Fills missing values in 'record' type blocks using defined rules.
@@ -13,6 +16,12 @@ class Imputer(BaseProcessor):
 
     component_name = "Imputer"
 
+    @classmethod
+    def can_handle(cls, blocks: list) -> float:
+        if super().can_handle(blocks) > 0:
+            return 0.8
+        return 0.0
+
     def initialize(self, imputation_rules: Dict[str, Any] = None, **kwargs):
         """
         Set imputation rules.
@@ -26,15 +35,15 @@
         if not self.rules:
             self._log("Imputer initialized with no rules.", level="warning")
 
-    def _do_process(self, blocks: List[
+    def _do_process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
         """
         Apply imputation rules to record blocks.
 
         Args:
-            blocks (List[
+            blocks (List[SayouBlock]): Input blocks.
 
         Returns:
-            List[
+            List[SayouBlock]: Blocks with missing values filled.
         """
         for block in blocks:
             if block.type != "record" or not isinstance(block.content, dict):
sayou/refinery/processor/outlier_handler.py
CHANGED
@@ -1,9 +1,12 @@
 from typing import Any, Dict, List
 
-from
+from sayou.core.registry import register_component
+from sayou.core.schemas import SayouBlock
+
 from ..interfaces.base_processor import BaseProcessor
 
 
+@register_component("processor")
 class OutlierHandler(BaseProcessor):
     """
     (Tier 2) Handles numerical outliers in 'record' blocks.
@@ -14,6 +17,10 @@ class OutlierHandler(BaseProcessor):
 
     component_name = "OutlierHandler"
 
+    @classmethod
+    def can_handle(cls, blocks: list) -> float:
+        return 0.8 if super().can_handle(blocks) > 0 else 0.0
+
     def initialize(self, outlier_rules: Dict[str, Dict[str, Any]] = None, **kwargs):
         """
         Set outlier handling rules.
@@ -29,15 +36,15 @@
         """
         self.rules = outlier_rules or {}
 
-    def _do_process(self, blocks: List[
+    def _do_process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
         """
         Check numerical fields against rules and filter/modify blocks.
 
         Args:
-            blocks (List[
+            blocks (List[SayouBlock]): Input blocks.
 
         Returns:
-            List[
+            List[SayouBlock]: Filtered or modified list of blocks.
         """
         valid_blocks = []
 
sayou/refinery/processor/pii_masker.py
CHANGED
@@ -1,10 +1,13 @@
 import re
 from typing import List
 
-from
+from sayou.core.registry import register_component
+from sayou.core.schemas import SayouBlock
+
 from ..interfaces.base_processor import BaseProcessor
 
 
+@register_component("processor")
 class PiiMasker(BaseProcessor):
     """
     (Tier 2) Masks Personally Identifiable Information (PII) in text blocks.
@@ -15,6 +18,10 @@ class PiiMasker(BaseProcessor):
 
     component_name = "PiiMasker"
 
+    @classmethod
+    def can_handle(cls, blocks: list) -> float:
+        return 1.0 if super().can_handle(blocks) > 0 else 0.0
+
     def initialize(self, mask_email: bool = True, mask_phone: bool = True, **kwargs):
         """
         Configure masking targets.
@@ -30,15 +37,15 @@ class PiiMasker(BaseProcessor):
         # Simple phone regex (customizable)
         self._phone_re = re.compile(r"\d{3}[-\.\s]??\d{3,4}[-\.\s]??\d{4}")
 
-    def _do_process(self, blocks: List[
+    def _do_process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
         """
         Apply masking regex to text content.
 
         Args:
-            blocks (List[
+            blocks (List[SayouBlock]): Input blocks.
 
         Returns:
-            List[
+            List[SayouBlock]: Blocks with sensitive info replaced by tokens.
         """
         for block in blocks:
             if block.type not in ["text", "md"] or not isinstance(block.content, str):
sayou/refinery/processor/text_cleaner.py
CHANGED
@@ -1,10 +1,13 @@
 import re
 from typing import List
 
-from
+from sayou.core.registry import register_component
+from sayou.core.schemas import SayouBlock
+
 from ..interfaces.base_processor import BaseProcessor
 
 
+@register_component("processor")
 class TextCleaner(BaseProcessor):
     """
     (Tier 2) Cleans text content using regex and whitespace normalization.
@@ -14,6 +17,12 @@ class TextCleaner(BaseProcessor):
 
     component_name = "TextCleaner"
 
+    @classmethod
+    def can_handle(cls, blocks: list) -> float:
+        if super().can_handle(blocks) > 0:
+            return 1.0
+        return 0.0
+
     def initialize(
         self, patterns: List[str] = None, normalize_space: bool = True, **kwargs
     ):
@@ -29,15 +38,15 @@ class TextCleaner(BaseProcessor):
         self.patterns = [re.compile(p) for p in (patterns or [])]
         self._space_re = re.compile(r"[ \t]+")
 
-    def _do_process(self, blocks: List[
+    def _do_process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
         """
         Apply cleaning logic to text blocks.
 
         Args:
-            blocks (List[
+            blocks (List[SayouBlock]): Input blocks.
 
         Returns:
-            List[
+            List[SayouBlock]: Cleaned blocks.
         """
         for block in blocks:
             if block.type not in ["text", "md"]:
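All five bundled processors above share the same shape: a module-level @register_component("processor") decorator, a component_name, a can_handle classmethod, and a _do_process method over SayouBlock lists, with pipeline.py also scanning sayou.refinery.plugins for extra modules. A minimal third-party processor written against that shape might look like the sketch below; the class name, the 0.9 score, the lowercasing behavior, and the assumption that dropping the module under sayou.refinery.plugins is enough for discovery are all illustrative, not documented API.

    # Hypothetical plugin sketch mirroring the bundled processors; names and scores are illustrative.
    from typing import List

    from sayou.core.registry import register_component
    from sayou.core.schemas import SayouBlock

    from sayou.refinery.interfaces.base_processor import BaseProcessor


    @register_component("processor")
    class LowercaseTextProcessor(BaseProcessor):
        """Example processor: lowercases text/markdown block content."""

        component_name = "LowercaseText"

        @classmethod
        def can_handle(cls, blocks: list) -> float:
            # Defer to the base check, then score, as the bundled processors do.
            return 0.9 if super().can_handle(blocks) > 0 else 0.0

        def initialize(self, **kwargs):
            # This example needs no configuration; accept kwargs like the bundled processors.
            pass

        def _do_process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
            for block in blocks:
                if block.type in ["text", "md"] and isinstance(block.content, str):
                    block.content = block.content.lower()
            return blocks

Once registered, such a class could be requested by name, e.g. processors=["LowercaseText"], which _resolve_processor_by_name matches against component_name.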
{sayou_refinery-0.1.6.dist-info → sayou_refinery-0.3.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sayou-refinery
-Version: 0.1.6
+Version: 0.3.3
 Summary: Refinery components for the Sayou Data Platform
 Project-URL: Homepage, https://www.sayouzone.com/
 Project-URL: Documentation, https://sayouzone.github.io/sayou-fabric/
@@ -214,7 +214,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
 Requires-Python: >=3.9
-Requires-Dist: sayou-core~=0.
+Requires-Dist: sayou-core~=0.3.0
 Description-Content-Type: text/markdown
 
 # sayou-refinery
@@ -227,7 +227,7 @@ Description-Content-Type: text/markdown
 
 `sayou-refinery` acts as the "Cleaning Plant" in your data pipeline.
 
-It transforms heterogeneous raw data (JSON Documents, HTML, DB Records) into a standardized stream of **
+It transforms heterogeneous raw data (JSON Documents, HTML, DB Records) into a standardized stream of **SayouBlocks**, ensuring that downstream components (like Chunkers or LLMs) receive clean, uniform data regardless of the original source format.
 
 ## 💡 Core Philosophy
 
@@ -235,7 +235,7 @@ It transforms heterogeneous raw data (JSON Documents, HTML, DB Records) into a s
 
 Refinery operates in two distinct stages to guarantee data quality:
 
-1. **Normalization (Shape Shifting):** Converts complex structures (nested JSON, HTML trees, DB Rows) into a linear list of `
+1. **Normalization (Shape Shifting):** Converts complex structures (nested JSON, HTML trees, DB Rows) into a linear list of `SayouBlocks`.
 2. **Processing (Cleaning):** Applies a chain of cleaning agents (Regex, Masking, Deduplication) to improve data hygiene.
 
 ## 📦 Installation
@@ -271,8 +271,8 @@ def run_demo():
     }
 
     # 3. Run Pipeline
-    #
-    blocks = pipeline.run(raw_doc,
+    # strategy: 'standard_doc', 'html', 'json', etc.
+    blocks = pipeline.run(raw_doc, strategy="standard_doc")
 
     # 4. Result
     for block in blocks:
sayou_refinery-0.3.3.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
+sayou/refinery/__init__.py,sha256=KjeNn3m72sukzHBqbvMvBz0g1zzHOdBfZjgfdBJ55_E,677
+sayou/refinery/pipeline.py,sha256=KR3B4QM7diYBJVwTP50L8MWykxABZw3oPbFUeo2SkjY,10393
+sayou/refinery/core/exceptions.py,sha256=_WUPH9EJ7y1JXLMtuBS7I63TPd_GztX8GtJj2JQwV1U,545
+sayou/refinery/interfaces/base_normalizer.py,sha256=CpmAEMM73uTJBmsIcyuMn2S6Ozaf74UCzQHIsoR1F4Y,2491
+sayou/refinery/interfaces/base_processor.py,sha256=whZg1LD-gmSr6b2Hnw1LBAlo0eN_Yu_N5CVKSiesie8,2274
+sayou/refinery/normalizer/doc_markdown_normalizer.py,sha256=t3mxTTYlQ7-WO8JwwOZXi6Cow8aiu1WCVtjLQVihagE,12449
+sayou/refinery/normalizer/html_text_normalizer.py,sha256=k7cDOg-KQIPpABME7onAq02XGkrgm-FWsrKnckeiOCY,2659
+sayou/refinery/normalizer/record_normalizer.py,sha256=0otSAKaaf7MnEyObW2CeyX0TxOUwubu9T1ae6qNgT04,2512
+sayou/refinery/processor/deduplicator.py,sha256=LLjQbipIyI0VuUoe1gzyihfpKgULWrmRYBtXntzfTdU,1644
+sayou/refinery/processor/imputer.py,sha256=x2WAQcHUWwzDnybNlL92bRAvkMmj32B059kihgyXvxY,1792
+sayou/refinery/processor/outlier_handler.py,sha256=rMSKcwmSvlE_LnrhyTbVjOku6Q3WKY5_CNhijSCWFBE,2900
+sayou/refinery/processor/pii_masker.py,sha256=MmS6HNA0w4EgjkPSkWlDGZ1QRNk_9qhU94EVc3cuH-8,1933
+sayou/refinery/processor/text_cleaner.py,sha256=nI6QHHWm_szValqUxdyq5i_54ttlUqyvsy0FzNqrTw0,1978
+sayou_refinery-0.3.3.dist-info/METADATA,sha256=k4Akjq3Ht-2aTgcQOFsBp4I7Pga0iZfsNHfuw9OcXaA,16979
+sayou_refinery-0.3.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+sayou_refinery-0.3.3.dist-info/RECORD,,