sayou-refinery 0.2.0__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/PKG-INFO +4 -4
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/README.md +2 -2
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/examples/quick_start.ipynb +6 -6
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/examples/quick_start.py +52 -23
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/pyproject.toml +2 -2
- sayou_refinery-0.3.1/src/sayou/refinery/__init__.py +21 -0
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/interfaces/base_normalizer.py +21 -0
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/interfaces/base_processor.py +21 -1
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/normalizer/doc_markdown_normalizer.py +89 -22
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/normalizer/html_text_normalizer.py +15 -1
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/normalizer/record_normalizer.py +16 -0
- sayou_refinery-0.3.1/src/sayou/refinery/pipeline.py +284 -0
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/processor/deduplicator.py +8 -0
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/processor/imputer.py +8 -0
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/processor/outlier_handler.py +6 -0
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/processor/pii_masker.py +6 -0
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/processor/text_cleaner.py +8 -0
- sayou_refinery-0.3.1/tests/test_refinery.py +121 -0
- sayou_refinery-0.2.0/src/sayou/refinery/pipeline.py +0 -109
- sayou_refinery-0.2.0/tests/test_refinery.py +0 -83
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/.gitignore +0 -0
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/core/exceptions.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sayou-refinery
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Refinery components for the Sayou Data Platform
|
|
5
5
|
Project-URL: Homepage, https://www.sayouzone.com/
|
|
6
6
|
Project-URL: Documentation, https://sayouzone.github.io/sayou-fabric/
|
|
@@ -214,7 +214,7 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
214
214
|
Classifier: Programming Language :: Python :: 3.11
|
|
215
215
|
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
|
216
216
|
Requires-Python: >=3.9
|
|
217
|
-
Requires-Dist: sayou-core~=0.
|
|
217
|
+
Requires-Dist: sayou-core~=0.3.0
|
|
218
218
|
Description-Content-Type: text/markdown
|
|
219
219
|
|
|
220
220
|
# sayou-refinery
|
|
@@ -271,8 +271,8 @@ def run_demo():
|
|
|
271
271
|
}
|
|
272
272
|
|
|
273
273
|
# 3. Run Pipeline
|
|
274
|
-
#
|
|
275
|
-
blocks = pipeline.run(raw_doc,
|
|
274
|
+
# strategy: 'standard_doc', 'html', 'json', etc.
|
|
275
|
+
blocks = pipeline.run(raw_doc, strategy="standard_doc")
|
|
276
276
|
|
|
277
277
|
# 4. Result
|
|
278
278
|
for block in blocks:
|
|
@@ -52,8 +52,8 @@ def run_demo():
|
|
|
52
52
|
}
|
|
53
53
|
|
|
54
54
|
# 3. Run Pipeline
|
|
55
|
-
#
|
|
56
|
-
blocks = pipeline.run(raw_doc,
|
|
55
|
+
# strategy: 'standard_doc', 'html', 'json', etc.
|
|
56
|
+
blocks = pipeline.run(raw_doc, strategy="standard_doc")
|
|
57
57
|
|
|
58
58
|
# 4. Result
|
|
59
59
|
for block in blocks:
|
|
@@ -114,8 +114,8 @@
|
|
|
114
114
|
"\n",
|
|
115
115
|
"print(\">>> Running Document Normalization...\")\n",
|
|
116
116
|
"\n",
|
|
117
|
-
"#
|
|
118
|
-
"blocks = pipeline.run(raw_doc,
|
|
117
|
+
"# strategy=\"standard_doc\" -> DocMarkdownNormalizer 선택\n",
|
|
118
|
+
"blocks = pipeline.run(raw_doc, strategy=\"standard_doc\")\n",
|
|
119
119
|
"\n",
|
|
120
120
|
"for b in blocks:\n",
|
|
121
121
|
" print(f\"[{b.type}] {b.content}\")"
|
|
@@ -152,8 +152,8 @@
|
|
|
152
152
|
"\n",
|
|
153
153
|
"print(\">>> Running HTML Normalization...\")\n",
|
|
154
154
|
"\n",
|
|
155
|
-
"#
|
|
156
|
-
"html_blocks = pipeline.run(dirty_html,
|
|
155
|
+
"# strategy=\"html\" -> HtmlTextNormalizer 선택\n",
|
|
156
|
+
"html_blocks = pipeline.run(dirty_html, strategy=\"html\")\n",
|
|
157
157
|
"\n",
|
|
158
158
|
"for b in html_blocks:\n",
|
|
159
159
|
" # repr()을 사용하여 공백 처리 확인\n",
|
|
@@ -188,8 +188,8 @@
|
|
|
188
188
|
"\n",
|
|
189
189
|
"print(\">>> Running Record Normalization...\")\n",
|
|
190
190
|
"\n",
|
|
191
|
-
"#
|
|
192
|
-
"record_blocks = pipeline.run(db_rows,
|
|
191
|
+
"# strategy=\"json\" -> RecordNormalizer 선택\n",
|
|
192
|
+
"record_blocks = pipeline.run(db_rows, strategy=\"json\")\n",
|
|
193
193
|
"\n",
|
|
194
194
|
"for b in record_blocks:\n",
|
|
195
195
|
" print(f\"[{b.type}] {b.content}\")"
|
|
@@ -1,25 +1,27 @@
|
|
|
1
|
+
import json
|
|
1
2
|
import logging
|
|
2
3
|
|
|
3
4
|
from sayou.refinery.pipeline import RefineryPipeline
|
|
4
5
|
|
|
5
|
-
logging.basicConfig(level=logging.INFO, format=
|
|
6
|
+
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
|
7
|
+
|
|
6
8
|
|
|
7
9
|
def run_demo():
|
|
8
10
|
print(">>> Initializing Sayou Refinery...")
|
|
9
|
-
|
|
11
|
+
|
|
10
12
|
# 설정 주입: PII 마스킹 켜기, 결측치 규칙, 이상치 규칙 설정
|
|
11
13
|
pipeline = RefineryPipeline()
|
|
12
14
|
pipeline.initialize(
|
|
13
15
|
mask_email=True,
|
|
14
16
|
imputation_rules={"category": "Unknown"},
|
|
15
|
-
outlier_rules={"price": {"min": 0, "max": 1000, "action": "clamp"}}
|
|
17
|
+
outlier_rules={"price": {"min": 0, "max": 1000, "action": "clamp"}},
|
|
16
18
|
)
|
|
17
19
|
|
|
18
20
|
# ---------------------------------------------------------
|
|
19
21
|
# Scenario 1: Document JSON -> Markdown (with PII Masking)
|
|
20
22
|
# ---------------------------------------------------------
|
|
21
23
|
print("\n=== [1] Document Normalization (Markdown + PII) ===")
|
|
22
|
-
|
|
24
|
+
|
|
23
25
|
# sayou-document가 생성했다고 가정한 더미 데이터
|
|
24
26
|
raw_doc = {
|
|
25
27
|
"metadata": {"title": "User Report", "author": "admin@sayou.ai"},
|
|
@@ -27,35 +29,51 @@ def run_demo():
|
|
|
27
29
|
{
|
|
28
30
|
"elements": [
|
|
29
31
|
{
|
|
30
|
-
"type": "text",
|
|
32
|
+
"type": "text",
|
|
31
33
|
"text": "Contact support at help@sayou.ai or 010-1234-5678.",
|
|
32
|
-
"raw_attributes": {
|
|
34
|
+
"raw_attributes": {
|
|
35
|
+
"semantic_type": "heading",
|
|
36
|
+
"heading_level": 1,
|
|
37
|
+
},
|
|
33
38
|
},
|
|
34
39
|
{
|
|
35
40
|
"type": "text",
|
|
36
41
|
"text": " Duplicate Paragraph. ",
|
|
37
|
-
"raw_attributes": {}
|
|
42
|
+
"raw_attributes": {},
|
|
38
43
|
},
|
|
39
44
|
{
|
|
40
45
|
"type": "text",
|
|
41
|
-
"text": "Duplicate Paragraph.",
|
|
42
|
-
"raw_attributes": {}
|
|
43
|
-
}
|
|
46
|
+
"text": "Duplicate Paragraph.",
|
|
47
|
+
"raw_attributes": {},
|
|
48
|
+
},
|
|
44
49
|
]
|
|
45
50
|
}
|
|
46
|
-
]
|
|
51
|
+
],
|
|
47
52
|
}
|
|
53
|
+
# with open(img_path, "r", encoding="utf-8") as f:
|
|
54
|
+
# raw_doc = json.load(f)
|
|
55
|
+
|
|
56
|
+
blocks = pipeline.run(raw_doc)
|
|
57
|
+
|
|
58
|
+
json_ready_blocks = []
|
|
48
59
|
|
|
49
|
-
blocks = pipeline.run(raw_doc, source_type="standard_doc")
|
|
50
|
-
|
|
51
60
|
for b in blocks:
|
|
52
61
|
print(f"[{b.type}] {b.content}")
|
|
62
|
+
if hasattr(b, "model_dump"):
|
|
63
|
+
json_ready_blocks.append(b.model_dump()) # Pydantic v2
|
|
64
|
+
elif hasattr(b, "dict"):
|
|
65
|
+
json_ready_blocks.append(b.dict()) # Pydantic v1
|
|
66
|
+
else:
|
|
67
|
+
json_ready_blocks.append(b.__dict__) # 일반 객체
|
|
68
|
+
|
|
69
|
+
with open("examples/result_demo.json", "w", encoding="utf-8") as f:
|
|
70
|
+
json.dump(json_ready_blocks, f, ensure_ascii=False, indent=4)
|
|
53
71
|
|
|
54
72
|
# ---------------------------------------------------------
|
|
55
73
|
# Scenario 2: Dirty HTML -> Clean Text
|
|
56
74
|
# ---------------------------------------------------------
|
|
57
75
|
print("\n=== [2] HTML Normalization (Tag Removal) ===")
|
|
58
|
-
|
|
76
|
+
|
|
59
77
|
dirty_html = """
|
|
60
78
|
<html>
|
|
61
79
|
<style>body { color: red; }</style>
|
|
@@ -66,8 +84,8 @@ def run_demo():
|
|
|
66
84
|
</body>
|
|
67
85
|
</html>
|
|
68
86
|
"""
|
|
69
|
-
|
|
70
|
-
blocks = pipeline.run(dirty_html,
|
|
87
|
+
|
|
88
|
+
blocks = pipeline.run(dirty_html, strategy="html")
|
|
71
89
|
for b in blocks:
|
|
72
90
|
print(f"[{b.type}] {repr(b.content)}")
|
|
73
91
|
|
|
@@ -75,17 +93,28 @@ def run_demo():
|
|
|
75
93
|
# Scenario 3: DB Records (Imputation & Outlier)
|
|
76
94
|
# ---------------------------------------------------------
|
|
77
95
|
print("\n=== [3] Record Normalization (Data Cleaning) ===")
|
|
78
|
-
|
|
96
|
+
|
|
79
97
|
db_rows = [
|
|
80
98
|
{"id": 1, "item": "Apple", "price": 500, "category": "Fruit"},
|
|
81
|
-
{
|
|
82
|
-
|
|
99
|
+
{
|
|
100
|
+
"id": 2,
|
|
101
|
+
"item": "Banana",
|
|
102
|
+
"price": 1500,
|
|
103
|
+
"category": None,
|
|
104
|
+
}, # 결측치 (-> Unknown)
|
|
105
|
+
{
|
|
106
|
+
"id": 3,
|
|
107
|
+
"item": "Diamond",
|
|
108
|
+
"price": 99999,
|
|
109
|
+
"category": "Gem",
|
|
110
|
+
}, # 이상치 (-> 1000 Clamp)
|
|
83
111
|
]
|
|
84
|
-
|
|
85
|
-
blocks = pipeline.run(db_rows,
|
|
86
|
-
|
|
112
|
+
|
|
113
|
+
blocks = pipeline.run(db_rows, strategy="json")
|
|
114
|
+
|
|
87
115
|
for b in blocks:
|
|
88
116
|
print(f"[{b.type}] {b.content}")
|
|
89
117
|
|
|
118
|
+
|
|
90
119
|
if __name__ == "__main__":
|
|
91
|
-
run_demo()
|
|
120
|
+
run_demo()
|
|
@@ -7,7 +7,7 @@ build-backend = "hatchling.build"
|
|
|
7
7
|
# -----------------
|
|
8
8
|
[project]
|
|
9
9
|
name = "sayou-refinery"
|
|
10
|
-
version = "0.
|
|
10
|
+
version = "0.3.1"
|
|
11
11
|
authors = [
|
|
12
12
|
{ name = "Sayouzone", email = "contact@sayouzone.com" },
|
|
13
13
|
]
|
|
@@ -24,7 +24,7 @@ classifiers = [
|
|
|
24
24
|
"Topic :: Software Development :: Libraries :: Application Frameworks",
|
|
25
25
|
]
|
|
26
26
|
dependencies = [
|
|
27
|
-
"sayou-core ~= 0.
|
|
27
|
+
"sayou-core ~= 0.3.0"
|
|
28
28
|
]
|
|
29
29
|
|
|
30
30
|
# -----------------
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from .pipeline import RefineryPipeline
|
|
2
|
+
from .normalizer.doc_markdown_normalizer import DocMarkdownNormalizer
|
|
3
|
+
from .normalizer.html_text_normalizer import HtmlTextNormalizer
|
|
4
|
+
from .normalizer.record_normalizer import RecordNormalizer
|
|
5
|
+
from .processor.deduplicator import Deduplicator
|
|
6
|
+
from .processor.imputer import Imputer
|
|
7
|
+
from .processor.outlier_handler import OutlierHandler
|
|
8
|
+
from .processor.pii_masker import PiiMasker
|
|
9
|
+
from .processor.text_cleaner import TextCleaner
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"RefineryPipeline",
|
|
13
|
+
"DocMarkdownNormalizer",
|
|
14
|
+
"HtmlTextNormalizer",
|
|
15
|
+
"RecordNormalizer",
|
|
16
|
+
"Deduplicator",
|
|
17
|
+
"Imputer",
|
|
18
|
+
"OutlierHandler",
|
|
19
|
+
"PiiMasker",
|
|
20
|
+
"TextCleaner",
|
|
21
|
+
]
|
{sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/interfaces/base_normalizer.py
RENAMED
|
@@ -19,6 +19,20 @@ class BaseNormalizer(BaseComponent):
|
|
|
19
19
|
component_name = "BaseNormalizer"
|
|
20
20
|
SUPPORTED_TYPES = []
|
|
21
21
|
|
|
22
|
+
@classmethod
|
|
23
|
+
def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
|
|
24
|
+
"""
|
|
25
|
+
Determines if this normalizer can handle the raw input data.
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
raw_data: The input data (dict, str, Document object, etc.)
|
|
29
|
+
strategy: Explicit type hint from user (e.g. 'html', 'json')
|
|
30
|
+
|
|
31
|
+
Returns:
|
|
32
|
+
float: Confidence score (0.0 to 1.0)
|
|
33
|
+
"""
|
|
34
|
+
return 0.0
|
|
35
|
+
|
|
22
36
|
@measure_time
|
|
23
37
|
def normalize(self, raw_data: Any) -> List[SayouBlock]:
|
|
24
38
|
"""
|
|
@@ -33,15 +47,22 @@ class BaseNormalizer(BaseComponent):
|
|
|
33
47
|
Raises:
|
|
34
48
|
NormalizationError: If transformation fails.
|
|
35
49
|
"""
|
|
50
|
+
self._emit("on_start", input_data={"type": type(raw_data).__name__})
|
|
51
|
+
|
|
36
52
|
self._log(f"Normalizing data (Type: {type(raw_data).__name__})")
|
|
53
|
+
|
|
37
54
|
try:
|
|
38
55
|
blocks = self._do_normalize(raw_data)
|
|
56
|
+
|
|
57
|
+
self._emit("on_finish", result_data={"blocks": len(blocks)}, success=True)
|
|
58
|
+
|
|
39
59
|
if not isinstance(blocks, list):
|
|
40
60
|
raise NormalizationError(f"Output must be a list, got {type(blocks)}")
|
|
41
61
|
|
|
42
62
|
return blocks
|
|
43
63
|
|
|
44
64
|
except Exception as e:
|
|
65
|
+
self._emit("on_error", error=e)
|
|
45
66
|
wrapped_error = NormalizationError(
|
|
46
67
|
f"[{self.component_name}] Failed: {str(e)}"
|
|
47
68
|
)
|
{sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/interfaces/base_processor.py
RENAMED
|
@@ -18,6 +18,20 @@ class BaseProcessor(BaseComponent):
|
|
|
18
18
|
|
|
19
19
|
component_name = "BaseProcessor"
|
|
20
20
|
|
|
21
|
+
@classmethod
|
|
22
|
+
def can_handle(cls, blocks: List[SayouBlock]) -> float:
|
|
23
|
+
"""
|
|
24
|
+
Processors are usually explicitly chained, but this allows for
|
|
25
|
+
future smart-selection (e.g., auto-detecting PII).
|
|
26
|
+
"""
|
|
27
|
+
if (
|
|
28
|
+
isinstance(blocks, list)
|
|
29
|
+
and len(blocks) > 0
|
|
30
|
+
and isinstance(blocks[0], SayouBlock)
|
|
31
|
+
):
|
|
32
|
+
return 0.5
|
|
33
|
+
return 0.0
|
|
34
|
+
|
|
21
35
|
@measure_time
|
|
22
36
|
def process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
|
|
23
37
|
"""
|
|
@@ -32,13 +46,19 @@ class BaseProcessor(BaseComponent):
|
|
|
32
46
|
Raises:
|
|
33
47
|
ProcessingError: If processing logic fails.
|
|
34
48
|
"""
|
|
49
|
+
self._emit("on_start", input_data={"blocks": len(blocks)})
|
|
35
50
|
try:
|
|
36
51
|
if not blocks:
|
|
37
52
|
return []
|
|
38
53
|
|
|
39
|
-
|
|
54
|
+
result = self._do_process(blocks)
|
|
55
|
+
|
|
56
|
+
self._emit("on_finish", result_data={"blocks": len(result)}, success=True)
|
|
57
|
+
|
|
58
|
+
return result
|
|
40
59
|
|
|
41
60
|
except Exception as e:
|
|
61
|
+
self._emit("on_error", error=e)
|
|
42
62
|
wrapped_error = ProcessingError(f"[{self.component_name}] Failed: {str(e)}")
|
|
43
63
|
self.logger.error(wrapped_error, exc_info=True)
|
|
44
64
|
raise wrapped_error
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
from typing import Any, Dict, List
|
|
2
2
|
|
|
3
|
+
from sayou.core.registry import register_component
|
|
3
4
|
from sayou.core.schemas import SayouBlock
|
|
4
5
|
|
|
5
6
|
from ..core.exceptions import NormalizationError
|
|
6
7
|
from ..interfaces.base_normalizer import BaseNormalizer
|
|
7
8
|
|
|
8
9
|
|
|
10
|
+
@register_component("normalizer")
|
|
9
11
|
class DocMarkdownNormalizer(BaseNormalizer):
|
|
10
12
|
"""
|
|
11
13
|
(Tier 2) Normalizes a Sayou Document Dictionary into Markdown SayouBlocks.
|
|
@@ -18,6 +20,24 @@ class DocMarkdownNormalizer(BaseNormalizer):
|
|
|
18
20
|
component_name = "DocMarkdownNormalizer"
|
|
19
21
|
SUPPORTED_TYPES = ["standard_doc", "sayou_doc_json"]
|
|
20
22
|
|
|
23
|
+
@classmethod
|
|
24
|
+
def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
|
|
25
|
+
if strategy in ["markdown", "standard_doc"]:
|
|
26
|
+
return 1.0
|
|
27
|
+
|
|
28
|
+
if hasattr(raw_data, "doc_type") and hasattr(raw_data, "pages"):
|
|
29
|
+
return 1.0
|
|
30
|
+
|
|
31
|
+
if isinstance(raw_data, str):
|
|
32
|
+
if any(
|
|
33
|
+
line.strip().startswith(("#", "-", "* "))
|
|
34
|
+
for line in raw_data.splitlines()[:10]
|
|
35
|
+
):
|
|
36
|
+
return 0.8
|
|
37
|
+
return 0.1
|
|
38
|
+
|
|
39
|
+
return 0.0
|
|
40
|
+
|
|
21
41
|
def initialize(
|
|
22
42
|
self,
|
|
23
43
|
include_headers: bool = True,
|
|
@@ -49,38 +69,85 @@ class DocMarkdownNormalizer(BaseNormalizer):
|
|
|
49
69
|
Raises:
|
|
50
70
|
NormalizationError: If `raw_data` is not a valid dictionary.
|
|
51
71
|
"""
|
|
52
|
-
|
|
72
|
+
# 1. Input Handling (Dict/Object/Str Safe Conversion)
|
|
73
|
+
if isinstance(raw_data, str):
|
|
74
|
+
return [SayouBlock(type="md", content=raw_data, metadata={})]
|
|
75
|
+
|
|
76
|
+
# Handle Pydantic models or objects safely
|
|
77
|
+
if hasattr(raw_data, "model_dump"):
|
|
78
|
+
doc_data = raw_data.model_dump()
|
|
79
|
+
elif hasattr(raw_data, "dict"):
|
|
80
|
+
doc_data = raw_data.dict()
|
|
81
|
+
elif hasattr(raw_data, "__dict__"):
|
|
82
|
+
doc_data = raw_data.__dict__
|
|
83
|
+
elif isinstance(raw_data, dict):
|
|
84
|
+
doc_data = raw_data
|
|
85
|
+
else:
|
|
53
86
|
raise NormalizationError(
|
|
54
|
-
f"Input must be
|
|
87
|
+
f"Input must be convertible to Dictionary, got {type(raw_data).__name__}"
|
|
55
88
|
)
|
|
56
89
|
|
|
57
|
-
|
|
58
|
-
blocks: List[SayouBlock] = []
|
|
90
|
+
normalized_blocks: List[SayouBlock] = []
|
|
59
91
|
|
|
60
|
-
|
|
61
|
-
blocks.extend(self._handle_doc_metadata(doc_data))
|
|
92
|
+
doc_meta = doc_data.get("metadata", {})
|
|
62
93
|
|
|
63
|
-
|
|
64
|
-
if
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
94
|
+
def sanitize_text(text: str) -> str:
|
|
95
|
+
if not text:
|
|
96
|
+
return ""
|
|
97
|
+
text = text.replace("\x0b", "\n")
|
|
98
|
+
text = text.replace("\r", "\n")
|
|
99
|
+
text = text.replace("\f", "\n")
|
|
100
|
+
return text
|
|
69
101
|
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
102
|
+
# 2. Iterate Pages
|
|
103
|
+
for page in doc_data.get("pages", []):
|
|
104
|
+
page_content_buffer = []
|
|
105
|
+
page_num = page.get("page_index", 0)
|
|
106
|
+
|
|
107
|
+
# Helper to extract text from elements using existing logic
|
|
108
|
+
def collect_text(elements, is_header=False, is_footer=False):
|
|
109
|
+
if not elements:
|
|
110
|
+
return
|
|
111
|
+
for element in elements:
|
|
112
|
+
sub_blocks = self._handle_element(element, is_header, is_footer)
|
|
113
|
+
for sb in sub_blocks:
|
|
114
|
+
if sb.content and sb.content.strip():
|
|
115
|
+
clean_content = sanitize_text(sb.content.strip())
|
|
116
|
+
page_content_buffer.append(clean_content)
|
|
117
|
+
|
|
118
|
+
# A. Header Elements
|
|
119
|
+
if self.include_headers:
|
|
120
|
+
collect_text(page.get("header_elements", []), is_header=True)
|
|
121
|
+
|
|
122
|
+
# B. Body Elements (Main Content)
|
|
123
|
+
collect_text(page.get("elements", []), is_header=False)
|
|
124
|
+
|
|
125
|
+
# C. Footer Elements
|
|
126
|
+
if self.include_footers:
|
|
127
|
+
collect_text(page.get("footer_elements", []), is_footer=True)
|
|
128
|
+
|
|
129
|
+
# 3. Aggregate: Create ONE Block per Page
|
|
130
|
+
if page_content_buffer:
|
|
131
|
+
full_page_text = "\n\n".join(page_content_buffer)
|
|
132
|
+
|
|
133
|
+
block_meta = doc_meta.copy()
|
|
134
|
+
block_meta.update(
|
|
135
|
+
{
|
|
136
|
+
"page_num": page_num,
|
|
137
|
+
"origin_type": "page_aggregated",
|
|
138
|
+
"source": doc_meta.get("filename", "unknown"),
|
|
139
|
+
}
|
|
73
140
|
)
|
|
74
141
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
self._handle_element(element, is_header=False, is_footer=True)
|
|
142
|
+
normalized_blocks.append(
|
|
143
|
+
SayouBlock(
|
|
144
|
+
type="md",
|
|
145
|
+
content=full_page_text,
|
|
146
|
+
metadata=block_meta,
|
|
81
147
|
)
|
|
148
|
+
)
|
|
82
149
|
|
|
83
|
-
return
|
|
150
|
+
return normalized_blocks
|
|
84
151
|
|
|
85
152
|
def _handle_element(
|
|
86
153
|
self, element: Dict[str, Any], is_header: bool, is_footer: bool
|
{sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/normalizer/html_text_normalizer.py
RENAMED
|
@@ -5,12 +5,14 @@ except ImportError:
|
|
|
5
5
|
|
|
6
6
|
from typing import Any, List
|
|
7
7
|
|
|
8
|
+
from sayou.core.registry import register_component
|
|
8
9
|
from sayou.core.schemas import SayouBlock
|
|
9
10
|
|
|
10
11
|
from ..core.exceptions import NormalizationError
|
|
11
12
|
from ..interfaces.base_normalizer import BaseNormalizer
|
|
12
13
|
|
|
13
14
|
|
|
15
|
+
@register_component("normalizer")
|
|
14
16
|
class HtmlTextNormalizer(BaseNormalizer):
|
|
15
17
|
"""
|
|
16
18
|
(Tier 2) Converts HTML string into a clean Text SayouBlock.
|
|
@@ -22,6 +24,18 @@ class HtmlTextNormalizer(BaseNormalizer):
|
|
|
22
24
|
component_name = "HtmlTextNormalizer"
|
|
23
25
|
SUPPORTED_TYPES = ["html"]
|
|
24
26
|
|
|
27
|
+
@classmethod
|
|
28
|
+
def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
|
|
29
|
+
if strategy in ["html"]:
|
|
30
|
+
return 1.0
|
|
31
|
+
|
|
32
|
+
if isinstance(raw_data, str):
|
|
33
|
+
sample = raw_data[:1000].lower()
|
|
34
|
+
if "<html" in sample or "<body" in sample or "<div" in sample:
|
|
35
|
+
return 0.9
|
|
36
|
+
|
|
37
|
+
return 0.0
|
|
38
|
+
|
|
25
39
|
def _do_normalize(self, raw_data: Any) -> List[SayouBlock]:
|
|
26
40
|
"""
|
|
27
41
|
Parse HTML and extract text.
|
|
@@ -55,4 +69,4 @@ class HtmlTextNormalizer(BaseNormalizer):
|
|
|
55
69
|
|
|
56
70
|
text = re.sub(r"\n{3,}", "\n\n", text).strip()
|
|
57
71
|
|
|
58
|
-
return [SayouBlock(type="text", content=text, metadata={"
|
|
72
|
+
return [SayouBlock(type="text", content=text, metadata={"strategy": "html"})]
|
{sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/normalizer/record_normalizer.py
RENAMED
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
from typing import Any, Dict, List
|
|
2
2
|
|
|
3
|
+
from sayou.core.registry import register_component
|
|
3
4
|
from sayou.core.schemas import SayouBlock
|
|
4
5
|
|
|
5
6
|
from ..core.exceptions import NormalizationError
|
|
6
7
|
from ..interfaces.base_normalizer import BaseNormalizer
|
|
7
8
|
|
|
8
9
|
|
|
10
|
+
@register_component("normalizer")
|
|
9
11
|
class RecordNormalizer(BaseNormalizer):
|
|
10
12
|
"""
|
|
11
13
|
(Tier 2) Converts structured data (Dict/List) into 'record' SayouBlocks.
|
|
@@ -17,6 +19,20 @@ class RecordNormalizer(BaseNormalizer):
|
|
|
17
19
|
component_name = "RecordNormalizer"
|
|
18
20
|
SUPPORTED_TYPES = ["json", "dict", "db_row", "record"]
|
|
19
21
|
|
|
22
|
+
@classmethod
|
|
23
|
+
def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
|
|
24
|
+
if strategy in ["json", "record", "db", "dict"]:
|
|
25
|
+
return 1.0
|
|
26
|
+
|
|
27
|
+
if isinstance(raw_data, dict):
|
|
28
|
+
return 0.9
|
|
29
|
+
if isinstance(raw_data, list):
|
|
30
|
+
if len(raw_data) > 0 and isinstance(raw_data[0], dict):
|
|
31
|
+
return 0.9
|
|
32
|
+
return 0.1
|
|
33
|
+
|
|
34
|
+
return 0.0
|
|
35
|
+
|
|
20
36
|
def _do_normalize(self, raw_data: Any) -> List[SayouBlock]:
|
|
21
37
|
"""
|
|
22
38
|
Convert dict or list of dicts into record blocks.
|
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
import pkgutil
|
|
3
|
+
from typing import Any, Dict, List, Optional, Type
|
|
4
|
+
|
|
5
|
+
from sayou.core.base_component import BaseComponent
|
|
6
|
+
from sayou.core.decorators import safe_run
|
|
7
|
+
from sayou.core.registry import COMPONENT_REGISTRY
|
|
8
|
+
from sayou.core.schemas import SayouBlock
|
|
9
|
+
|
|
10
|
+
from .core.exceptions import RefineryError
|
|
11
|
+
from .interfaces.base_normalizer import BaseNormalizer
|
|
12
|
+
from .interfaces.base_processor import BaseProcessor
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class RefineryPipeline(BaseComponent):
|
|
16
|
+
"""
|
|
17
|
+
Orchestrates the data refinement process via dynamic registry.
|
|
18
|
+
|
|
19
|
+
Workflow:
|
|
20
|
+
1. Normalization: Converts raw input (Document, HTML, JSON) into standard SayouBlocks.
|
|
21
|
+
2. Processing: Applies a chain of processors (Cleaning, Masking, Dedup) to the blocks.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
component_name = "RefineryPipeline"
|
|
25
|
+
|
|
26
|
+
def __init__(
|
|
27
|
+
self,
|
|
28
|
+
extra_normalizers: Optional[List[Type[BaseNormalizer]]] = None,
|
|
29
|
+
**kwargs,
|
|
30
|
+
):
|
|
31
|
+
"""
|
|
32
|
+
Initializes the pipeline and discovers available plugins.
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
extra_normalizers: Optional list of custom normalizer classes to register.
|
|
36
|
+
**kwargs: Global configuration passed down to components.
|
|
37
|
+
e.g., processors=["cleaner", "pii_masker"]
|
|
38
|
+
"""
|
|
39
|
+
super().__init__()
|
|
40
|
+
|
|
41
|
+
self.normalizer_cls_map: Dict[str, Type[BaseNormalizer]] = {}
|
|
42
|
+
self.processor_cls_map: Dict[str, Type[BaseProcessor]] = {}
|
|
43
|
+
|
|
44
|
+
self._register("sayou.refinery.normalizer")
|
|
45
|
+
self._register("sayou.refinery.processor")
|
|
46
|
+
self._register("sayou.refinery.plugins")
|
|
47
|
+
|
|
48
|
+
self._load_from_registry()
|
|
49
|
+
|
|
50
|
+
if extra_normalizers:
|
|
51
|
+
for cls in extra_normalizers:
|
|
52
|
+
self._register_manual(cls)
|
|
53
|
+
|
|
54
|
+
self.global_config = kwargs
|
|
55
|
+
|
|
56
|
+
self.initialize(**kwargs)
|
|
57
|
+
|
|
58
|
+
def _register_manual(self, cls):
|
|
59
|
+
"""
|
|
60
|
+
Safely registers a user-provided class.
|
|
61
|
+
"""
|
|
62
|
+
if not isinstance(cls, type):
|
|
63
|
+
raise TypeError(
|
|
64
|
+
f"Invalid normalizer: {cls}. "
|
|
65
|
+
f"Please pass the CLASS itself (e.g., MyNormalizer), not an instance (MyNormalizer())."
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
name = getattr(cls, "component_name", cls.__name__)
|
|
69
|
+
self.normalizer_cls_map[name] = cls
|
|
70
|
+
|
|
71
|
+
@classmethod
|
|
72
|
+
def process(
|
|
73
|
+
cls,
|
|
74
|
+
raw_data: Any,
|
|
75
|
+
strategy: str = "auto",
|
|
76
|
+
processors: List[str] = None,
|
|
77
|
+
**kwargs,
|
|
78
|
+
) -> List[SayouBlock]:
|
|
79
|
+
"""
|
|
80
|
+
[Facade] One-line execution method.
|
|
81
|
+
|
|
82
|
+
Args:
|
|
83
|
+
raw_data (Any): Input data to refine.
|
|
84
|
+
strategy (str): Hint for normalizer selection (default: 'auto').
|
|
85
|
+
**kwargs: Configuration options.
|
|
86
|
+
|
|
87
|
+
Returns:
|
|
88
|
+
List[SayouBlock]: Refined data blocks.
|
|
89
|
+
"""
|
|
90
|
+
instance = cls(**kwargs)
|
|
91
|
+
return instance.run(raw_data, strategy, processors, **kwargs)
|
|
92
|
+
|
|
93
|
+
def _register(self, package_name: str):
|
|
94
|
+
"""
|
|
95
|
+
Automatically discovers and registers plugins from the specified package.
|
|
96
|
+
|
|
97
|
+
Args:
|
|
98
|
+
package_name (str): The dot-separated package path (e.g., 'sayou.refinery.plugins').
|
|
99
|
+
"""
|
|
100
|
+
try:
|
|
101
|
+
package = importlib.import_module(package_name)
|
|
102
|
+
if hasattr(package, "__path__"):
|
|
103
|
+
for _, name, _ in pkgutil.iter_modules(package.__path__):
|
|
104
|
+
full_name = f"{package_name}.{name}"
|
|
105
|
+
try:
|
|
106
|
+
importlib.import_module(full_name)
|
|
107
|
+
self._log(f"Discovered module: {full_name}", level="debug")
|
|
108
|
+
except Exception as e:
|
|
109
|
+
self._log(
|
|
110
|
+
f"Failed to import module {full_name}: {e}", level="warning"
|
|
111
|
+
)
|
|
112
|
+
except ImportError as e:
|
|
113
|
+
self._log(f"Package not found: {package_name} ({e})", level="debug")
|
|
114
|
+
|
|
115
|
+
def _load_from_registry(self):
|
|
116
|
+
"""
|
|
117
|
+
Populates local component maps from the global registry.
|
|
118
|
+
"""
|
|
119
|
+
if "normalizer" in COMPONENT_REGISTRY:
|
|
120
|
+
self.normalizer_cls_map.update(COMPONENT_REGISTRY["normalizer"])
|
|
121
|
+
|
|
122
|
+
if "processor" in COMPONENT_REGISTRY:
|
|
123
|
+
self.processor_cls_map.update(COMPONENT_REGISTRY["processor"])
|
|
124
|
+
|
|
125
|
+
@safe_run(default_return=None)
|
|
126
|
+
def initialize(self, **kwargs):
|
|
127
|
+
"""
|
|
128
|
+
Initialize all sub-components (Normalizers and Processors).
|
|
129
|
+
Passes global configuration (like PII masking rules) down to components.
|
|
130
|
+
"""
|
|
131
|
+
"""
|
|
132
|
+
Updates global configuration and logs status.
|
|
133
|
+
Actual component instantiation happens lazily during run().
|
|
134
|
+
|
|
135
|
+
Args:
|
|
136
|
+
**kwargs: Updates to the global configuration.
|
|
137
|
+
"""
|
|
138
|
+
self.global_config.update(kwargs)
|
|
139
|
+
|
|
140
|
+
n_norm = len(self.normalizer_cls_map)
|
|
141
|
+
n_proc = len(self.processor_cls_map)
|
|
142
|
+
self._log(
|
|
143
|
+
f"RefineryPipeline initialized. Available: {n_norm} Normalizers, {n_proc} Processors."
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
def run(
|
|
147
|
+
self,
|
|
148
|
+
raw_data: Any,
|
|
149
|
+
strategy: str = "auto",
|
|
150
|
+
processors: Optional[List[str]] = None,
|
|
151
|
+
**kwargs,
|
|
152
|
+
) -> List[SayouBlock]:
|
|
153
|
+
"""
|
|
154
|
+
Executes the refinement pipeline: Normalize -> Process Chain.
|
|
155
|
+
|
|
156
|
+
Args:
|
|
157
|
+
raw_data (Any): Input data (Document object, dict, string, etc.).
|
|
158
|
+
strategy (str): Hint for normalizer (default: 'auto').
|
|
159
|
+
processors (List[str], optional): List of processor names to execute in order.
|
|
160
|
+
If None, executes all registered processors (or a default set).
|
|
161
|
+
**kwargs: Runtime configuration.
|
|
162
|
+
|
|
163
|
+
Returns:
|
|
164
|
+
List[SayouBlock]: A list of clean, normalized blocks.
|
|
165
|
+
"""
|
|
166
|
+
if raw_data is None:
|
|
167
|
+
return []
|
|
168
|
+
|
|
169
|
+
run_config = {**self.global_config, **kwargs}
|
|
170
|
+
|
|
171
|
+
self._emit("on_start", input_data={"strategy": strategy})
|
|
172
|
+
|
|
173
|
+
# ---------------------------------------------------------
|
|
174
|
+
# Step 1: Normalize (Smart Routing)
|
|
175
|
+
# ---------------------------------------------------------
|
|
176
|
+
normalizer_cls = self._resolve_normalizer(raw_data, strategy)
|
|
177
|
+
|
|
178
|
+
if not normalizer_cls:
|
|
179
|
+
error_msg = f"No suitable normalizer found for strategy='{strategy}'"
|
|
180
|
+
self._emit("on_error", error=Exception(error_msg))
|
|
181
|
+
raise RefineryError(error_msg)
|
|
182
|
+
|
|
183
|
+
# Instantiate Normalizer
|
|
184
|
+
normalizer = normalizer_cls()
|
|
185
|
+
normalizer.initialize(**run_config)
|
|
186
|
+
|
|
187
|
+
try:
|
|
188
|
+
self._log(f"Normalizing with {normalizer.component_name}...")
|
|
189
|
+
blocks = normalizer.normalize(raw_data)
|
|
190
|
+
except Exception as e:
|
|
191
|
+
self._emit("on_error", error=e)
|
|
192
|
+
self._log(f"Normalization failed: {e}", level="error")
|
|
193
|
+
return []
|
|
194
|
+
|
|
195
|
+
# ---------------------------------------------------------
|
|
196
|
+
# Step 2: Process Chain (Dynamic Execution)
|
|
197
|
+
# ---------------------------------------------------------
|
|
198
|
+
chain_names = (
|
|
199
|
+
processors if processors is not None else run_config.get("processors", [])
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
if not chain_names and not processors:
|
|
203
|
+
chain_names = []
|
|
204
|
+
|
|
205
|
+
active_processors = []
|
|
206
|
+
|
|
207
|
+
for name in chain_names:
|
|
208
|
+
proc_cls = self._resolve_processor_by_name(name)
|
|
209
|
+
if proc_cls:
|
|
210
|
+
proc = proc_cls()
|
|
211
|
+
proc.initialize(**run_config)
|
|
212
|
+
active_processors.append(proc)
|
|
213
|
+
else:
|
|
214
|
+
self._log(f"Processor '{name}' not found in registry.", level="warning")
|
|
215
|
+
|
|
216
|
+
for proc in active_processors:
|
|
217
|
+
try:
|
|
218
|
+
self._log(f"Running Processor: {proc.component_name}")
|
|
219
|
+
blocks = proc.process(blocks)
|
|
220
|
+
except Exception as e:
|
|
221
|
+
self._log(f"Processor {proc.component_name} failed: {e}", level="error")
|
|
222
|
+
|
|
223
|
+
self._emit("on_finish", result_data={"blocks_count": len(blocks)}, success=True)
|
|
224
|
+
return blocks
|
|
225
|
+
|
|
226
|
+
def _resolve_normalizer(
|
|
227
|
+
self,
|
|
228
|
+
raw_data: Any,
|
|
229
|
+
strategy: str,
|
|
230
|
+
) -> Optional[Type[BaseNormalizer]]:
|
|
231
|
+
"""
|
|
232
|
+
Selects the best normalizer based on score or explicit type match.
|
|
233
|
+
"""
|
|
234
|
+
if strategy in self.normalizer_cls_map:
|
|
235
|
+
return self.normalizer_cls_map[strategy]
|
|
236
|
+
|
|
237
|
+
best_score = 0.0
|
|
238
|
+
best_cls = None
|
|
239
|
+
|
|
240
|
+
log_lines = [
|
|
241
|
+
f"Scoring for Item (Type: {raw_data.type}, Len: {len(raw_data.content)}):",
|
|
242
|
+
f"Content: {raw_data.content[:30]}",
|
|
243
|
+
]
|
|
244
|
+
|
|
245
|
+
for cls in set(self.normalizer_cls_map.values()):
|
|
246
|
+
try:
|
|
247
|
+
score = cls.can_handle(raw_data, strategy)
|
|
248
|
+
|
|
249
|
+
mark = ""
|
|
250
|
+
if score > best_score:
|
|
251
|
+
best_score = score
|
|
252
|
+
best_cls = cls
|
|
253
|
+
mark = "👑"
|
|
254
|
+
|
|
255
|
+
log_lines.append(f" - {cls.__name__}: {score} {mark}")
|
|
256
|
+
|
|
257
|
+
except Exception as e:
|
|
258
|
+
log_lines.append(f" - {cls.__name__}: Error ({e})")
|
|
259
|
+
|
|
260
|
+
self._log("\n".join(log_lines))
|
|
261
|
+
|
|
262
|
+
if best_cls and best_score > 0.0:
|
|
263
|
+
return best_cls
|
|
264
|
+
|
|
265
|
+
self._log(
|
|
266
|
+
"⚠️ No suitable normalizer found (Score 0).",
|
|
267
|
+
level="warning",
|
|
268
|
+
)
|
|
269
|
+
return None
|
|
270
|
+
|
|
271
|
+
def _resolve_processor_by_name(self, name: str) -> Optional[Type[BaseProcessor]]:
|
|
272
|
+
"""
|
|
273
|
+
Finds a processor class by its component_name or registry key.
|
|
274
|
+
"""
|
|
275
|
+
# 1. Exact Key Match
|
|
276
|
+
if name in self.processor_cls_map:
|
|
277
|
+
return self.processor_cls_map[name]
|
|
278
|
+
|
|
279
|
+
# 2. Component Name Match (Loop search)
|
|
280
|
+
for cls in self.processor_cls_map.values():
|
|
281
|
+
if getattr(cls, "component_name", "") == name:
|
|
282
|
+
return cls
|
|
283
|
+
|
|
284
|
+
return None
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
import json
|
|
2
2
|
from typing import List, Set
|
|
3
3
|
|
|
4
|
+
from sayou.core.registry import register_component
|
|
4
5
|
from sayou.core.schemas import SayouBlock
|
|
5
6
|
|
|
6
7
|
from ..interfaces.base_processor import BaseProcessor
|
|
7
8
|
|
|
8
9
|
|
|
10
|
+
@register_component("processor")
|
|
9
11
|
class Deduplicator(BaseProcessor):
|
|
10
12
|
"""
|
|
11
13
|
(Tier 2) Removes duplicate blocks based on content hashing.
|
|
@@ -16,6 +18,12 @@ class Deduplicator(BaseProcessor):
|
|
|
16
18
|
|
|
17
19
|
component_name = "Deduplicator"
|
|
18
20
|
|
|
21
|
+
@classmethod
def can_handle(cls, blocks: list) -> float:
    """Applicability score: deduplication only pays off for a list holding
    at least two blocks."""
    applicable = isinstance(blocks, list) and len(blocks) > 1
    return 1.0 if applicable else 0.0
|
|
26
|
+
|
|
19
27
|
def _do_process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
|
|
20
28
|
"""
|
|
21
29
|
Iterate through blocks and remove duplicates.
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
from typing import Any, Dict, List
|
|
2
2
|
|
|
3
|
+
from sayou.core.registry import register_component
|
|
3
4
|
from sayou.core.schemas import SayouBlock
|
|
4
5
|
|
|
5
6
|
from ..interfaces.base_processor import BaseProcessor
|
|
6
7
|
|
|
7
8
|
|
|
9
|
+
@register_component("processor")
|
|
8
10
|
class Imputer(BaseProcessor):
|
|
9
11
|
"""
|
|
10
12
|
(Tier 2) Fills missing values in 'record' type blocks using defined rules.
|
|
@@ -14,6 +16,12 @@ class Imputer(BaseProcessor):
|
|
|
14
16
|
|
|
15
17
|
component_name = "Imputer"
|
|
16
18
|
|
|
19
|
+
@classmethod
def can_handle(cls, blocks: list) -> float:
    """Applicability score: 0.8 whenever the generic base-class block check
    passes, otherwise 0."""
    base_score = super().can_handle(blocks)
    return 0.8 if base_score > 0 else 0.0
|
|
24
|
+
|
|
17
25
|
def initialize(self, imputation_rules: Dict[str, Any] = None, **kwargs):
|
|
18
26
|
"""
|
|
19
27
|
Set imputation rules.
|
{sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/processor/outlier_handler.py
RENAMED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
from typing import Any, Dict, List
|
|
2
2
|
|
|
3
|
+
from sayou.core.registry import register_component
|
|
3
4
|
from sayou.core.schemas import SayouBlock
|
|
4
5
|
|
|
5
6
|
from ..interfaces.base_processor import BaseProcessor
|
|
6
7
|
|
|
7
8
|
|
|
9
|
+
@register_component("processor")
|
|
8
10
|
class OutlierHandler(BaseProcessor):
|
|
9
11
|
"""
|
|
10
12
|
(Tier 2) Handles numerical outliers in 'record' blocks.
|
|
@@ -15,6 +17,10 @@ class OutlierHandler(BaseProcessor):
|
|
|
15
17
|
|
|
16
18
|
component_name = "OutlierHandler"
|
|
17
19
|
|
|
20
|
+
@classmethod
def can_handle(cls, blocks: list) -> float:
    """Applicability score: 0.8 when the generic base-class check succeeds."""
    if super().can_handle(blocks) > 0:
        return 0.8
    return 0.0
|
|
23
|
+
|
|
18
24
|
def initialize(self, outlier_rules: Dict[str, Dict[str, Any]] = None, **kwargs):
|
|
19
25
|
"""
|
|
20
26
|
Set outlier handling rules.
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
import re
|
|
2
2
|
from typing import List
|
|
3
3
|
|
|
4
|
+
from sayou.core.registry import register_component
|
|
4
5
|
from sayou.core.schemas import SayouBlock
|
|
5
6
|
|
|
6
7
|
from ..interfaces.base_processor import BaseProcessor
|
|
7
8
|
|
|
8
9
|
|
|
10
|
+
@register_component("processor")
|
|
9
11
|
class PiiMasker(BaseProcessor):
|
|
10
12
|
"""
|
|
11
13
|
(Tier 2) Masks Personally Identifiable Information (PII) in text blocks.
|
|
@@ -16,6 +18,10 @@ class PiiMasker(BaseProcessor):
|
|
|
16
18
|
|
|
17
19
|
component_name = "PiiMasker"
|
|
18
20
|
|
|
21
|
+
@classmethod
def can_handle(cls, blocks: list) -> float:
    """Applicability score: PII masking applies to any list the base check
    accepts."""
    if super().can_handle(blocks) > 0:
        return 1.0
    return 0.0
|
|
24
|
+
|
|
19
25
|
def initialize(self, mask_email: bool = True, mask_phone: bool = True, **kwargs):
|
|
20
26
|
"""
|
|
21
27
|
Configure masking targets.
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
import re
|
|
2
2
|
from typing import List
|
|
3
3
|
|
|
4
|
+
from sayou.core.registry import register_component
|
|
4
5
|
from sayou.core.schemas import SayouBlock
|
|
5
6
|
|
|
6
7
|
from ..interfaces.base_processor import BaseProcessor
|
|
7
8
|
|
|
8
9
|
|
|
10
|
+
@register_component("processor")
|
|
9
11
|
class TextCleaner(BaseProcessor):
|
|
10
12
|
"""
|
|
11
13
|
(Tier 2) Cleans text content using regex and whitespace normalization.
|
|
@@ -15,6 +17,12 @@ class TextCleaner(BaseProcessor):
|
|
|
15
17
|
|
|
16
18
|
component_name = "TextCleaner"
|
|
17
19
|
|
|
20
|
+
@classmethod
def can_handle(cls, blocks: list) -> float:
    """Applicability score: text cleaning applies to any list the base check
    accepts."""
    return 1.0 if super().can_handle(blocks) > 0 else 0.0
|
|
25
|
+
|
|
18
26
|
def initialize(
|
|
19
27
|
self, patterns: List[str] = None, normalize_space: bool = True, **kwargs
|
|
20
28
|
):
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
|
|
3
|
+
from sayou.refinery.pipeline import RefineryPipeline
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class TestRefineryPipeline(unittest.TestCase):

    def setUp(self):
        # Registry loading happens lazily inside process(); nothing to do here.
        pass

    def test_doc_markdown_normalization_with_dict(self):
        """[Normalizer] A raw dict shaped like a Document is converted to
        Markdown blocks (duck typing: no Document class required)."""
        # Use a raw dictionary instead of a Document object to verify
        # Refinery's independence from upstream types.
        document_payload = {
            "doc_type": "pdf",
            "pages": [
                {
                    "elements": [
                        {
                            "type": "text",
                            "text": "Contact: test@test.com",
                            "meta": {"semantic_type": "heading"},
                        }
                    ]
                }
            ],
        }

        # One-liner: construct + run.
        # strategy="standard_doc" forces DocMarkdownNormalizer.
        result_blocks = RefineryPipeline.process(
            document_payload, strategy="standard_doc"
        )

        print(f"\n[Test 1] Generated Blocks: {result_blocks}")

        self.assertTrue(len(result_blocks) > 0)
        self.assertEqual(result_blocks[0].type, "md")
        self.assertIn("Contact:", result_blocks[0].content)

    def test_deduplication_processor(self):
        """[Processor] The deduplicator runs when explicitly requested."""
        document_payload = {
            "pages": [
                {
                    "elements": [
                        {"type": "text", "text": "Unique Line"},
                        {"type": "text", "text": "Dup Line"},
                        {"type": "text", "text": "Dup Line"},  # duplicate
                    ]
                }
            ]
        }

        # Explicitly request the Deduplicator in the chain.
        result_blocks = RefineryPipeline.process(
            document_payload, strategy="standard_doc", processors=["Deduplicator"]
        )

        # 3 inputs -> 2 outputs (duplicate removed).
        self.assertEqual(len(result_blocks), 2)

        contents = [block.content for block in result_blocks]
        self.assertEqual(contents.count("Dup Line"), 1)

    def test_record_processing_chain(self):
        """[Chain] JSON records -> imputation -> outlier removal."""
        record_rows = [
            {"id": 1, "category": None, "score": 50},  # imputation target
            {"id": 2, "category": "A", "score": 200},  # outlier target (max: 100)
            {"id": 3, "category": "B", "score": 90},  # healthy row
        ]

        # Runtime configuration injected straight into process().
        runtime_config = {
            "imputation_rules": {"category": "General"},  # None -> 'General'
            "outlier_rules": {"score": {"max": 100, "action": "drop"}},  # drop > 100
        }

        result_blocks = RefineryPipeline.process(
            record_rows,
            strategy="json",  # selects RecordNormalizer
            processors=["Imputer", "OutlierHandler"],  # executed in order
            **runtime_config,
        )

        # 3 inputs -> 2 outputs (id 2 dropped).
        self.assertEqual(len(result_blocks), 2)

        # Imputer result: None -> "General".
        first_record = result_blocks[0].content  # RecordNormalizer keeps dicts
        self.assertEqual(first_record["category"], "General")

        # Outlier result: id 2 is gone, id 3 survives.
        surviving_ids = [block.content["id"] for block in result_blocks]
        self.assertNotIn(2, surviving_ids)
        self.assertIn(3, surviving_ids)

    def test_auto_routing_html(self):
        """[Auto] HTML input is detected when no strategy is given."""
        html_payload = "<html><body><div>Hello World</div></body></html>"

        # strategy="auto" is the default.
        result_blocks = RefineryPipeline.process(html_payload)

        # HtmlTextNormalizer should have been selected and extracted the text.
        self.assertTrue(len(result_blocks) > 0)
        self.assertEqual(result_blocks[0].type, "text")
        self.assertIn("Hello World", result_blocks[0].content)
|
+
|
|
119
|
+
|
|
120
|
+
if __name__ == "__main__":
    # Allow running this test module directly: `python test_refinery.py`.
    unittest.main()
|
|
@@ -1,109 +0,0 @@
|
|
|
1
|
-
from typing import Any, Dict, List, Optional
|
|
2
|
-
|
|
3
|
-
from sayou.core.base_component import BaseComponent
|
|
4
|
-
from sayou.core.decorators import safe_run
|
|
5
|
-
from sayou.core.schemas import SayouBlock
|
|
6
|
-
|
|
7
|
-
from .core.exceptions import RefineryError
|
|
8
|
-
from .interfaces.base_normalizer import BaseNormalizer
|
|
9
|
-
from .interfaces.base_processor import BaseProcessor
|
|
10
|
-
from .normalizer.doc_markdown_normalizer import DocMarkdownNormalizer
|
|
11
|
-
from .normalizer.html_text_normalizer import HtmlTextNormalizer
|
|
12
|
-
from .normalizer.record_normalizer import RecordNormalizer
|
|
13
|
-
from .processor.deduplicator import Deduplicator
|
|
14
|
-
from .processor.imputer import Imputer
|
|
15
|
-
from .processor.outlier_handler import OutlierHandler
|
|
16
|
-
from .processor.pii_masker import PiiMasker
|
|
17
|
-
from .processor.text_cleaner import TextCleaner
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class RefineryPipeline(BaseComponent):
    """
    Orchestrates the data refinement process.
    1. Selects a Normalizer to convert raw data into standard SayouBlocks.
    2. Runs a chain of Processors to clean and transform the blocks.
    """

    component_name = "RefineryPipeline"

    def __init__(
        self,
        extra_normalizers: Optional[List[BaseNormalizer]] = None,
        processors: Optional[List[BaseProcessor]] = None,
    ):
        super().__init__()
        # Normalizers keyed by the source types they support; a later
        # registration with the same type overrides an earlier one, so
        # user-supplied extras win over the defaults below.
        self.normalizers: Dict[str, BaseNormalizer] = {}

        # 1. Register Default Normalizers
        defaults = [DocMarkdownNormalizer(), HtmlTextNormalizer(), RecordNormalizer()]
        self._register(defaults)

        # 2. Register User Extras
        if extra_normalizers:
            self._register(extra_normalizers)

        # 3. Setup Processors Chain
        # When no explicit chain is supplied, every built-in processor runs
        # in this fixed order.
        self.processors = (
            processors
            if processors is not None
            else [
                TextCleaner(),
                PiiMasker(),
                Deduplicator(),
                Imputer(),
                OutlierHandler(),
            ]
        )

    def _register(self, comps: List[BaseNormalizer]):
        # Map each normalizer under every source type it declares support for.
        for c in comps:
            for t in getattr(c, "SUPPORTED_TYPES", []):
                self.normalizers[t] = c

    @safe_run(default_return=None)
    def initialize(self, **kwargs):
        """
        Initialize all sub-components (Normalizers and Processors).
        Passes global configuration (like PII masking rules) down to components.
        """
        # set() dedupes normalizers registered under multiple source types.
        for norm in set(self.normalizers.values()):
            norm.initialize(**kwargs)

        for proc in self.processors:
            proc.initialize(**kwargs)

        self._log(
            f"Refinery initialized with {len(self.processors)} processors in chain."
        )

    def run(self, raw_data: Any, source_type: str = "standard_doc") -> List[SayouBlock]:
        """
        Execute the refinement pipeline.

        Args:
            raw_data: The raw input data (dict, html string, db row list, etc.)
            source_type: The type of input data (e.g., 'standard_doc', 'html', 'json')

        Returns:
            List[SayouBlock]: A list of clean, normalized blocks.
        """
        # Step 1: Normalize (Structure Transformation)
        normalizer = self.normalizers.get(source_type)
        if not normalizer:
            supported = list(self.normalizers.keys())
            raise RefineryError(
                f"Unknown source_type '{source_type}'. Supported: {supported}"
            )

        try:
            blocks = normalizer.normalize(raw_data)
        except Exception as e:
            # Normalization failure is non-fatal: log and return no blocks.
            self.logger.error(f"Normalization step failed: {e}")
            return []

        # Step 2: Process (Content Cleaning)
        # Processors modify blocks in-place or return new lists
        for processor in self.processors:
            blocks = processor.process(blocks)

        return blocks
|
|
@@ -1,83 +0,0 @@
|
|
|
1
|
-
import unittest
|
|
2
|
-
|
|
3
|
-
from sayou.refinery.pipeline import RefineryPipeline
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class TestRefineryPipeline(unittest.TestCase):

    def setUp(self):
        # Build one pipeline per test and push shared configuration
        # (masking + imputation + outlier rules) down to every component.
        self.pipeline = RefineryPipeline()

        self.pipeline.initialize(
            mask_email=True,
            imputation_rules={"tag": "general"},
            outlier_rules={"score": {"max": 100, "action": "drop"}},
        )

    def test_doc_markdown_normalization(self):
        """[Normalizer] Verify a document dict converts to a Markdown block and gets masked."""
        raw_doc = {
            "pages": [
                {
                    "elements": [
                        {
                            "type": "text",
                            "text": "Contact: test@test.com",
                            "raw_attributes": {"semantic_type": "heading"},
                        }
                    ]
                }
            ]
        }

        blocks = self.pipeline.run(raw_doc, source_type="standard_doc")

        self.assertEqual(len(blocks), 1)
        self.assertEqual(blocks[0].type, "md")
        # Heading prefix from the normalizer, [EMAIL] from the PII masker.
        self.assertEqual(blocks[0].content, "# Contact: [EMAIL]")

    def test_deduplication(self):
        """[Processor] Verify duplicated text blocks are removed."""
        raw_doc = {
            "pages": [
                {
                    "elements": [
                        {"type": "text", "text": "Unique Line Content"},
                        {"type": "text", "text": "Duplicate Line Content"},
                        {"type": "text", "text": "Duplicate Line Content"},
                    ]
                }
            ]
        }
        blocks = self.pipeline.run(raw_doc, source_type="standard_doc")

        # 3 inputs -> 2 outputs (duplicate removed)
        self.assertEqual(len(blocks), 2)

        content_list = [b.content for b in blocks]
        self.assertIn("Unique Line Content", content_list)
        self.assertEqual(content_list.count("Duplicate Line Content"), 1)

    def test_record_processing(self):
        """[Processor] Verify record imputation and outlier dropping."""
        raw_records = [
            {"id": 1, "tag": None, "score": 50},  # imputation target
            {"id": 2, "tag": "A", "score": 200},  # outlier (drop) target (max: 100)
            {"id": 3, "tag": "B", "score": 90},  # healthy row
        ]

        blocks = self.pipeline.run(raw_records, source_type="json")

        # 3 inputs -> 2 outputs
        self.assertEqual(len(blocks), 2)

        block1 = blocks[0].content
        self.assertEqual(block1["tag"], "general")

        ids = [b.content["id"] for b in blocks]
        self.assertNotIn(2, ids)
        self.assertIn(3, ids)
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
if __name__ == "__main__":
    # Allow running this test module directly: `python test_refinery.py`.
    unittest.main()
|
|
File without changes
|
|
File without changes
|