sayou-refinery 0.2.0__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/PKG-INFO +4 -4
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/README.md +2 -2
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/examples/quick_start.ipynb +6 -6
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/examples/quick_start.py +52 -23
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/pyproject.toml +2 -2
- sayou_refinery-0.3.1/src/sayou/refinery/__init__.py +21 -0
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/interfaces/base_normalizer.py +21 -0
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/interfaces/base_processor.py +21 -1
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/normalizer/doc_markdown_normalizer.py +89 -22
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/normalizer/html_text_normalizer.py +15 -1
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/normalizer/record_normalizer.py +16 -0
- sayou_refinery-0.3.1/src/sayou/refinery/pipeline.py +284 -0
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/processor/deduplicator.py +8 -0
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/processor/imputer.py +8 -0
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/processor/outlier_handler.py +6 -0
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/processor/pii_masker.py +6 -0
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/processor/text_cleaner.py +8 -0
- sayou_refinery-0.3.1/tests/test_refinery.py +121 -0
- sayou_refinery-0.2.0/src/sayou/refinery/pipeline.py +0 -109
- sayou_refinery-0.2.0/tests/test_refinery.py +0 -83
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/.gitignore +0 -0
- {sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/core/exceptions.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sayou-refinery
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Refinery components for the Sayou Data Platform
|
|
5
5
|
Project-URL: Homepage, https://www.sayouzone.com/
|
|
6
6
|
Project-URL: Documentation, https://sayouzone.github.io/sayou-fabric/
|
|
@@ -214,7 +214,7 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
214
214
|
Classifier: Programming Language :: Python :: 3.11
|
|
215
215
|
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
|
216
216
|
Requires-Python: >=3.9
|
|
217
|
-
Requires-Dist: sayou-core~=0.
|
|
217
|
+
Requires-Dist: sayou-core~=0.3.0
|
|
218
218
|
Description-Content-Type: text/markdown
|
|
219
219
|
|
|
220
220
|
# sayou-refinery
|
|
@@ -271,8 +271,8 @@ def run_demo():
|
|
|
271
271
|
}
|
|
272
272
|
|
|
273
273
|
# 3. Run Pipeline
|
|
274
|
-
#
|
|
275
|
-
blocks = pipeline.run(raw_doc,
|
|
274
|
+
# strategy: 'standard_doc', 'html', 'json', etc.
|
|
275
|
+
blocks = pipeline.run(raw_doc, strategy="standard_doc")
|
|
276
276
|
|
|
277
277
|
# 4. Result
|
|
278
278
|
for block in blocks:
|
|
@@ -52,8 +52,8 @@ def run_demo():
|
|
|
52
52
|
}
|
|
53
53
|
|
|
54
54
|
# 3. Run Pipeline
|
|
55
|
-
#
|
|
56
|
-
blocks = pipeline.run(raw_doc,
|
|
55
|
+
# strategy: 'standard_doc', 'html', 'json', etc.
|
|
56
|
+
blocks = pipeline.run(raw_doc, strategy="standard_doc")
|
|
57
57
|
|
|
58
58
|
# 4. Result
|
|
59
59
|
for block in blocks:
|
|
@@ -114,8 +114,8 @@
|
|
|
114
114
|
"\n",
|
|
115
115
|
"print(\">>> Running Document Normalization...\")\n",
|
|
116
116
|
"\n",
|
|
117
|
-
"#
|
|
118
|
-
"blocks = pipeline.run(raw_doc,
|
|
117
|
+
"# strategy=\"standard_doc\" -> DocMarkdownNormalizer 선택\n",
|
|
118
|
+
"blocks = pipeline.run(raw_doc, strategy=\"standard_doc\")\n",
|
|
119
119
|
"\n",
|
|
120
120
|
"for b in blocks:\n",
|
|
121
121
|
" print(f\"[{b.type}] {b.content}\")"
|
|
@@ -152,8 +152,8 @@
|
|
|
152
152
|
"\n",
|
|
153
153
|
"print(\">>> Running HTML Normalization...\")\n",
|
|
154
154
|
"\n",
|
|
155
|
-
"#
|
|
156
|
-
"html_blocks = pipeline.run(dirty_html,
|
|
155
|
+
"# strategy=\"html\" -> HtmlTextNormalizer 선택\n",
|
|
156
|
+
"html_blocks = pipeline.run(dirty_html, strategy=\"html\")\n",
|
|
157
157
|
"\n",
|
|
158
158
|
"for b in html_blocks:\n",
|
|
159
159
|
" # repr()을 사용하여 공백 처리 확인\n",
|
|
@@ -188,8 +188,8 @@
|
|
|
188
188
|
"\n",
|
|
189
189
|
"print(\">>> Running Record Normalization...\")\n",
|
|
190
190
|
"\n",
|
|
191
|
-
"#
|
|
192
|
-
"record_blocks = pipeline.run(db_rows,
|
|
191
|
+
"# strategy=\"json\" -> RecordNormalizer 선택\n",
|
|
192
|
+
"record_blocks = pipeline.run(db_rows, strategy=\"json\")\n",
|
|
193
193
|
"\n",
|
|
194
194
|
"for b in record_blocks:\n",
|
|
195
195
|
" print(f\"[{b.type}] {b.content}\")"
|
|
@@ -1,25 +1,27 @@
|
|
|
1
|
+
import json
|
|
1
2
|
import logging
|
|
2
3
|
|
|
3
4
|
from sayou.refinery.pipeline import RefineryPipeline
|
|
4
5
|
|
|
5
|
-
logging.basicConfig(level=logging.INFO, format=
|
|
6
|
+
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
|
7
|
+
|
|
6
8
|
|
|
7
9
|
def run_demo():
|
|
8
10
|
print(">>> Initializing Sayou Refinery...")
|
|
9
|
-
|
|
11
|
+
|
|
10
12
|
# 설정 주입: PII 마스킹 켜기, 결측치 규칙, 이상치 규칙 설정
|
|
11
13
|
pipeline = RefineryPipeline()
|
|
12
14
|
pipeline.initialize(
|
|
13
15
|
mask_email=True,
|
|
14
16
|
imputation_rules={"category": "Unknown"},
|
|
15
|
-
outlier_rules={"price": {"min": 0, "max": 1000, "action": "clamp"}}
|
|
17
|
+
outlier_rules={"price": {"min": 0, "max": 1000, "action": "clamp"}},
|
|
16
18
|
)
|
|
17
19
|
|
|
18
20
|
# ---------------------------------------------------------
|
|
19
21
|
# Scenario 1: Document JSON -> Markdown (with PII Masking)
|
|
20
22
|
# ---------------------------------------------------------
|
|
21
23
|
print("\n=== [1] Document Normalization (Markdown + PII) ===")
|
|
22
|
-
|
|
24
|
+
|
|
23
25
|
# sayou-document가 생성했다고 가정한 더미 데이터
|
|
24
26
|
raw_doc = {
|
|
25
27
|
"metadata": {"title": "User Report", "author": "admin@sayou.ai"},
|
|
@@ -27,35 +29,51 @@ def run_demo():
|
|
|
27
29
|
{
|
|
28
30
|
"elements": [
|
|
29
31
|
{
|
|
30
|
-
"type": "text",
|
|
32
|
+
"type": "text",
|
|
31
33
|
"text": "Contact support at help@sayou.ai or 010-1234-5678.",
|
|
32
|
-
"raw_attributes": {
|
|
34
|
+
"raw_attributes": {
|
|
35
|
+
"semantic_type": "heading",
|
|
36
|
+
"heading_level": 1,
|
|
37
|
+
},
|
|
33
38
|
},
|
|
34
39
|
{
|
|
35
40
|
"type": "text",
|
|
36
41
|
"text": " Duplicate Paragraph. ",
|
|
37
|
-
"raw_attributes": {}
|
|
42
|
+
"raw_attributes": {},
|
|
38
43
|
},
|
|
39
44
|
{
|
|
40
45
|
"type": "text",
|
|
41
|
-
"text": "Duplicate Paragraph.",
|
|
42
|
-
"raw_attributes": {}
|
|
43
|
-
}
|
|
46
|
+
"text": "Duplicate Paragraph.",
|
|
47
|
+
"raw_attributes": {},
|
|
48
|
+
},
|
|
44
49
|
]
|
|
45
50
|
}
|
|
46
|
-
]
|
|
51
|
+
],
|
|
47
52
|
}
|
|
53
|
+
# with open(img_path, "r", encoding="utf-8") as f:
|
|
54
|
+
# raw_doc = json.load(f)
|
|
55
|
+
|
|
56
|
+
blocks = pipeline.run(raw_doc)
|
|
57
|
+
|
|
58
|
+
json_ready_blocks = []
|
|
48
59
|
|
|
49
|
-
blocks = pipeline.run(raw_doc, source_type="standard_doc")
|
|
50
|
-
|
|
51
60
|
for b in blocks:
|
|
52
61
|
print(f"[{b.type}] {b.content}")
|
|
62
|
+
if hasattr(b, "model_dump"):
|
|
63
|
+
json_ready_blocks.append(b.model_dump()) # Pydantic v2
|
|
64
|
+
elif hasattr(b, "dict"):
|
|
65
|
+
json_ready_blocks.append(b.dict()) # Pydantic v1
|
|
66
|
+
else:
|
|
67
|
+
json_ready_blocks.append(b.__dict__) # 일반 객체
|
|
68
|
+
|
|
69
|
+
with open("examples/result_demo.json", "w", encoding="utf-8") as f:
|
|
70
|
+
json.dump(json_ready_blocks, f, ensure_ascii=False, indent=4)
|
|
53
71
|
|
|
54
72
|
# ---------------------------------------------------------
|
|
55
73
|
# Scenario 2: Dirty HTML -> Clean Text
|
|
56
74
|
# ---------------------------------------------------------
|
|
57
75
|
print("\n=== [2] HTML Normalization (Tag Removal) ===")
|
|
58
|
-
|
|
76
|
+
|
|
59
77
|
dirty_html = """
|
|
60
78
|
<html>
|
|
61
79
|
<style>body { color: red; }</style>
|
|
@@ -66,8 +84,8 @@ def run_demo():
|
|
|
66
84
|
</body>
|
|
67
85
|
</html>
|
|
68
86
|
"""
|
|
69
|
-
|
|
70
|
-
blocks = pipeline.run(dirty_html,
|
|
87
|
+
|
|
88
|
+
blocks = pipeline.run(dirty_html, strategy="html")
|
|
71
89
|
for b in blocks:
|
|
72
90
|
print(f"[{b.type}] {repr(b.content)}")
|
|
73
91
|
|
|
@@ -75,17 +93,28 @@ def run_demo():
|
|
|
75
93
|
# Scenario 3: DB Records (Imputation & Outlier)
|
|
76
94
|
# ---------------------------------------------------------
|
|
77
95
|
print("\n=== [3] Record Normalization (Data Cleaning) ===")
|
|
78
|
-
|
|
96
|
+
|
|
79
97
|
db_rows = [
|
|
80
98
|
{"id": 1, "item": "Apple", "price": 500, "category": "Fruit"},
|
|
81
|
-
{
|
|
82
|
-
|
|
99
|
+
{
|
|
100
|
+
"id": 2,
|
|
101
|
+
"item": "Banana",
|
|
102
|
+
"price": 1500,
|
|
103
|
+
"category": None,
|
|
104
|
+
}, # 결측치 (-> Unknown)
|
|
105
|
+
{
|
|
106
|
+
"id": 3,
|
|
107
|
+
"item": "Diamond",
|
|
108
|
+
"price": 99999,
|
|
109
|
+
"category": "Gem",
|
|
110
|
+
}, # 이상치 (-> 1000 Clamp)
|
|
83
111
|
]
|
|
84
|
-
|
|
85
|
-
blocks = pipeline.run(db_rows,
|
|
86
|
-
|
|
112
|
+
|
|
113
|
+
blocks = pipeline.run(db_rows, strategy="json")
|
|
114
|
+
|
|
87
115
|
for b in blocks:
|
|
88
116
|
print(f"[{b.type}] {b.content}")
|
|
89
117
|
|
|
118
|
+
|
|
90
119
|
if __name__ == "__main__":
|
|
91
|
-
run_demo()
|
|
120
|
+
run_demo()
|
|
@@ -7,7 +7,7 @@ build-backend = "hatchling.build"
|
|
|
7
7
|
# -----------------
|
|
8
8
|
[project]
|
|
9
9
|
name = "sayou-refinery"
|
|
10
|
-
version = "0.
|
|
10
|
+
version = "0.3.1"
|
|
11
11
|
authors = [
|
|
12
12
|
{ name = "Sayouzone", email = "contact@sayouzone.com" },
|
|
13
13
|
]
|
|
@@ -24,7 +24,7 @@ classifiers = [
|
|
|
24
24
|
"Topic :: Software Development :: Libraries :: Application Frameworks",
|
|
25
25
|
]
|
|
26
26
|
dependencies = [
|
|
27
|
-
"sayou-core ~= 0.
|
|
27
|
+
"sayou-core ~= 0.3.0"
|
|
28
28
|
]
|
|
29
29
|
|
|
30
30
|
# -----------------
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from .pipeline import RefineryPipeline
|
|
2
|
+
from .normalizer.doc_markdown_normalizer import DocMarkdownNormalizer
|
|
3
|
+
from .normalizer.html_text_normalizer import HtmlTextNormalizer
|
|
4
|
+
from .normalizer.record_normalizer import RecordNormalizer
|
|
5
|
+
from .processor.deduplicator import Deduplicator
|
|
6
|
+
from .processor.imputer import Imputer
|
|
7
|
+
from .processor.outlier_handler import OutlierHandler
|
|
8
|
+
from .processor.pii_masker import PiiMasker
|
|
9
|
+
from .processor.text_cleaner import TextCleaner
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"RefineryPipeline",
|
|
13
|
+
"DocMarkdownNormalizer",
|
|
14
|
+
"HtmlTextNormalizer",
|
|
15
|
+
"RecordNormalizer",
|
|
16
|
+
"Deduplicator",
|
|
17
|
+
"Imputer",
|
|
18
|
+
"OutlierHandler",
|
|
19
|
+
"PiiMasker",
|
|
20
|
+
"TextCleaner",
|
|
21
|
+
]
|
{sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/interfaces/base_normalizer.py
RENAMED
|
@@ -19,6 +19,20 @@ class BaseNormalizer(BaseComponent):
|
|
|
19
19
|
component_name = "BaseNormalizer"
|
|
20
20
|
SUPPORTED_TYPES = []
|
|
21
21
|
|
|
22
|
+
@classmethod
|
|
23
|
+
def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
|
|
24
|
+
"""
|
|
25
|
+
Determines if this normalizer can handle the raw input data.
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
raw_data: The input data (dict, str, Document object, etc.)
|
|
29
|
+
strategy: Explicit type hint from user (e.g. 'html', 'json')
|
|
30
|
+
|
|
31
|
+
Returns:
|
|
32
|
+
float: Confidence score (0.0 to 1.0)
|
|
33
|
+
"""
|
|
34
|
+
return 0.0
|
|
35
|
+
|
|
22
36
|
@measure_time
|
|
23
37
|
def normalize(self, raw_data: Any) -> List[SayouBlock]:
|
|
24
38
|
"""
|
|
@@ -33,15 +47,22 @@ class BaseNormalizer(BaseComponent):
|
|
|
33
47
|
Raises:
|
|
34
48
|
NormalizationError: If transformation fails.
|
|
35
49
|
"""
|
|
50
|
+
self._emit("on_start", input_data={"type": type(raw_data).__name__})
|
|
51
|
+
|
|
36
52
|
self._log(f"Normalizing data (Type: {type(raw_data).__name__})")
|
|
53
|
+
|
|
37
54
|
try:
|
|
38
55
|
blocks = self._do_normalize(raw_data)
|
|
56
|
+
|
|
57
|
+
self._emit("on_finish", result_data={"blocks": len(blocks)}, success=True)
|
|
58
|
+
|
|
39
59
|
if not isinstance(blocks, list):
|
|
40
60
|
raise NormalizationError(f"Output must be a list, got {type(blocks)}")
|
|
41
61
|
|
|
42
62
|
return blocks
|
|
43
63
|
|
|
44
64
|
except Exception as e:
|
|
65
|
+
self._emit("on_error", error=e)
|
|
45
66
|
wrapped_error = NormalizationError(
|
|
46
67
|
f"[{self.component_name}] Failed: {str(e)}"
|
|
47
68
|
)
|
{sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/interfaces/base_processor.py
RENAMED
|
@@ -18,6 +18,20 @@ class BaseProcessor(BaseComponent):
|
|
|
18
18
|
|
|
19
19
|
component_name = "BaseProcessor"
|
|
20
20
|
|
|
21
|
+
@classmethod
|
|
22
|
+
def can_handle(cls, blocks: List[SayouBlock]) -> float:
|
|
23
|
+
"""
|
|
24
|
+
Processors are usually explicitly chained, but this allows for
|
|
25
|
+
future smart-selection (e.g., auto-detecting PII).
|
|
26
|
+
"""
|
|
27
|
+
if (
|
|
28
|
+
isinstance(blocks, list)
|
|
29
|
+
and len(blocks) > 0
|
|
30
|
+
and isinstance(blocks[0], SayouBlock)
|
|
31
|
+
):
|
|
32
|
+
return 0.5
|
|
33
|
+
return 0.0
|
|
34
|
+
|
|
21
35
|
@measure_time
|
|
22
36
|
def process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
|
|
23
37
|
"""
|
|
@@ -32,13 +46,19 @@ class BaseProcessor(BaseComponent):
|
|
|
32
46
|
Raises:
|
|
33
47
|
ProcessingError: If processing logic fails.
|
|
34
48
|
"""
|
|
49
|
+
self._emit("on_start", input_data={"blocks": len(blocks)})
|
|
35
50
|
try:
|
|
36
51
|
if not blocks:
|
|
37
52
|
return []
|
|
38
53
|
|
|
39
|
-
|
|
54
|
+
result = self._do_process(blocks)
|
|
55
|
+
|
|
56
|
+
self._emit("on_finish", result_data={"blocks": len(result)}, success=True)
|
|
57
|
+
|
|
58
|
+
return result
|
|
40
59
|
|
|
41
60
|
except Exception as e:
|
|
61
|
+
self._emit("on_error", error=e)
|
|
42
62
|
wrapped_error = ProcessingError(f"[{self.component_name}] Failed: {str(e)}")
|
|
43
63
|
self.logger.error(wrapped_error, exc_info=True)
|
|
44
64
|
raise wrapped_error
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
from typing import Any, Dict, List
|
|
2
2
|
|
|
3
|
+
from sayou.core.registry import register_component
|
|
3
4
|
from sayou.core.schemas import SayouBlock
|
|
4
5
|
|
|
5
6
|
from ..core.exceptions import NormalizationError
|
|
6
7
|
from ..interfaces.base_normalizer import BaseNormalizer
|
|
7
8
|
|
|
8
9
|
|
|
10
|
+
@register_component("normalizer")
|
|
9
11
|
class DocMarkdownNormalizer(BaseNormalizer):
|
|
10
12
|
"""
|
|
11
13
|
(Tier 2) Normalizes a Sayou Document Dictionary into Markdown SayouBlocks.
|
|
@@ -18,6 +20,24 @@ class DocMarkdownNormalizer(BaseNormalizer):
|
|
|
18
20
|
component_name = "DocMarkdownNormalizer"
|
|
19
21
|
SUPPORTED_TYPES = ["standard_doc", "sayou_doc_json"]
|
|
20
22
|
|
|
23
|
+
@classmethod
|
|
24
|
+
def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
|
|
25
|
+
if strategy in ["markdown", "standard_doc"]:
|
|
26
|
+
return 1.0
|
|
27
|
+
|
|
28
|
+
if hasattr(raw_data, "doc_type") and hasattr(raw_data, "pages"):
|
|
29
|
+
return 1.0
|
|
30
|
+
|
|
31
|
+
if isinstance(raw_data, str):
|
|
32
|
+
if any(
|
|
33
|
+
line.strip().startswith(("#", "-", "* "))
|
|
34
|
+
for line in raw_data.splitlines()[:10]
|
|
35
|
+
):
|
|
36
|
+
return 0.8
|
|
37
|
+
return 0.1
|
|
38
|
+
|
|
39
|
+
return 0.0
|
|
40
|
+
|
|
21
41
|
def initialize(
|
|
22
42
|
self,
|
|
23
43
|
include_headers: bool = True,
|
|
@@ -49,38 +69,85 @@ class DocMarkdownNormalizer(BaseNormalizer):
|
|
|
49
69
|
Raises:
|
|
50
70
|
NormalizationError: If `raw_data` is not a valid dictionary.
|
|
51
71
|
"""
|
|
52
|
-
|
|
72
|
+
# 1. Input Handling (Dict/Object/Str Safe Conversion)
|
|
73
|
+
if isinstance(raw_data, str):
|
|
74
|
+
return [SayouBlock(type="md", content=raw_data, metadata={})]
|
|
75
|
+
|
|
76
|
+
# Handle Pydantic models or objects safely
|
|
77
|
+
if hasattr(raw_data, "model_dump"):
|
|
78
|
+
doc_data = raw_data.model_dump()
|
|
79
|
+
elif hasattr(raw_data, "dict"):
|
|
80
|
+
doc_data = raw_data.dict()
|
|
81
|
+
elif hasattr(raw_data, "__dict__"):
|
|
82
|
+
doc_data = raw_data.__dict__
|
|
83
|
+
elif isinstance(raw_data, dict):
|
|
84
|
+
doc_data = raw_data
|
|
85
|
+
else:
|
|
53
86
|
raise NormalizationError(
|
|
54
|
-
f"Input must be
|
|
87
|
+
f"Input must be convertible to Dictionary, got {type(raw_data).__name__}"
|
|
55
88
|
)
|
|
56
89
|
|
|
57
|
-
|
|
58
|
-
blocks: List[SayouBlock] = []
|
|
90
|
+
normalized_blocks: List[SayouBlock] = []
|
|
59
91
|
|
|
60
|
-
|
|
61
|
-
blocks.extend(self._handle_doc_metadata(doc_data))
|
|
92
|
+
doc_meta = doc_data.get("metadata", {})
|
|
62
93
|
|
|
63
|
-
|
|
64
|
-
if
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
94
|
+
def sanitize_text(text: str) -> str:
|
|
95
|
+
if not text:
|
|
96
|
+
return ""
|
|
97
|
+
text = text.replace("\x0b", "\n")
|
|
98
|
+
text = text.replace("\r", "\n")
|
|
99
|
+
text = text.replace("\f", "\n")
|
|
100
|
+
return text
|
|
69
101
|
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
102
|
+
# 2. Iterate Pages
|
|
103
|
+
for page in doc_data.get("pages", []):
|
|
104
|
+
page_content_buffer = []
|
|
105
|
+
page_num = page.get("page_index", 0)
|
|
106
|
+
|
|
107
|
+
# Helper to extract text from elements using existing logic
|
|
108
|
+
def collect_text(elements, is_header=False, is_footer=False):
|
|
109
|
+
if not elements:
|
|
110
|
+
return
|
|
111
|
+
for element in elements:
|
|
112
|
+
sub_blocks = self._handle_element(element, is_header, is_footer)
|
|
113
|
+
for sb in sub_blocks:
|
|
114
|
+
if sb.content and sb.content.strip():
|
|
115
|
+
clean_content = sanitize_text(sb.content.strip())
|
|
116
|
+
page_content_buffer.append(clean_content)
|
|
117
|
+
|
|
118
|
+
# A. Header Elements
|
|
119
|
+
if self.include_headers:
|
|
120
|
+
collect_text(page.get("header_elements", []), is_header=True)
|
|
121
|
+
|
|
122
|
+
# B. Body Elements (Main Content)
|
|
123
|
+
collect_text(page.get("elements", []), is_header=False)
|
|
124
|
+
|
|
125
|
+
# C. Footer Elements
|
|
126
|
+
if self.include_footers:
|
|
127
|
+
collect_text(page.get("footer_elements", []), is_footer=True)
|
|
128
|
+
|
|
129
|
+
# 3. Aggregate: Create ONE Block per Page
|
|
130
|
+
if page_content_buffer:
|
|
131
|
+
full_page_text = "\n\n".join(page_content_buffer)
|
|
132
|
+
|
|
133
|
+
block_meta = doc_meta.copy()
|
|
134
|
+
block_meta.update(
|
|
135
|
+
{
|
|
136
|
+
"page_num": page_num,
|
|
137
|
+
"origin_type": "page_aggregated",
|
|
138
|
+
"source": doc_meta.get("filename", "unknown"),
|
|
139
|
+
}
|
|
73
140
|
)
|
|
74
141
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
self._handle_element(element, is_header=False, is_footer=True)
|
|
142
|
+
normalized_blocks.append(
|
|
143
|
+
SayouBlock(
|
|
144
|
+
type="md",
|
|
145
|
+
content=full_page_text,
|
|
146
|
+
metadata=block_meta,
|
|
81
147
|
)
|
|
148
|
+
)
|
|
82
149
|
|
|
83
|
-
return
|
|
150
|
+
return normalized_blocks
|
|
84
151
|
|
|
85
152
|
def _handle_element(
|
|
86
153
|
self, element: Dict[str, Any], is_header: bool, is_footer: bool
|
{sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/normalizer/html_text_normalizer.py
RENAMED
|
@@ -5,12 +5,14 @@ except ImportError:
|
|
|
5
5
|
|
|
6
6
|
from typing import Any, List
|
|
7
7
|
|
|
8
|
+
from sayou.core.registry import register_component
|
|
8
9
|
from sayou.core.schemas import SayouBlock
|
|
9
10
|
|
|
10
11
|
from ..core.exceptions import NormalizationError
|
|
11
12
|
from ..interfaces.base_normalizer import BaseNormalizer
|
|
12
13
|
|
|
13
14
|
|
|
15
|
+
@register_component("normalizer")
|
|
14
16
|
class HtmlTextNormalizer(BaseNormalizer):
|
|
15
17
|
"""
|
|
16
18
|
(Tier 2) Converts HTML string into a clean Text SayouBlock.
|
|
@@ -22,6 +24,18 @@ class HtmlTextNormalizer(BaseNormalizer):
|
|
|
22
24
|
component_name = "HtmlTextNormalizer"
|
|
23
25
|
SUPPORTED_TYPES = ["html"]
|
|
24
26
|
|
|
27
|
+
@classmethod
|
|
28
|
+
def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
|
|
29
|
+
if strategy in ["html"]:
|
|
30
|
+
return 1.0
|
|
31
|
+
|
|
32
|
+
if isinstance(raw_data, str):
|
|
33
|
+
sample = raw_data[:1000].lower()
|
|
34
|
+
if "<html" in sample or "<body" in sample or "<div" in sample:
|
|
35
|
+
return 0.9
|
|
36
|
+
|
|
37
|
+
return 0.0
|
|
38
|
+
|
|
25
39
|
def _do_normalize(self, raw_data: Any) -> List[SayouBlock]:
|
|
26
40
|
"""
|
|
27
41
|
Parse HTML and extract text.
|
|
@@ -55,4 +69,4 @@ class HtmlTextNormalizer(BaseNormalizer):
|
|
|
55
69
|
|
|
56
70
|
text = re.sub(r"\n{3,}", "\n\n", text).strip()
|
|
57
71
|
|
|
58
|
-
return [SayouBlock(type="text", content=text, metadata={"
|
|
72
|
+
return [SayouBlock(type="text", content=text, metadata={"strategy": "html"})]
|
{sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/normalizer/record_normalizer.py
RENAMED
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
from typing import Any, Dict, List
|
|
2
2
|
|
|
3
|
+
from sayou.core.registry import register_component
|
|
3
4
|
from sayou.core.schemas import SayouBlock
|
|
4
5
|
|
|
5
6
|
from ..core.exceptions import NormalizationError
|
|
6
7
|
from ..interfaces.base_normalizer import BaseNormalizer
|
|
7
8
|
|
|
8
9
|
|
|
10
|
+
@register_component("normalizer")
|
|
9
11
|
class RecordNormalizer(BaseNormalizer):
|
|
10
12
|
"""
|
|
11
13
|
(Tier 2) Converts structured data (Dict/List) into 'record' SayouBlocks.
|
|
@@ -17,6 +19,20 @@ class RecordNormalizer(BaseNormalizer):
|
|
|
17
19
|
component_name = "RecordNormalizer"
|
|
18
20
|
SUPPORTED_TYPES = ["json", "dict", "db_row", "record"]
|
|
19
21
|
|
|
22
|
+
@classmethod
|
|
23
|
+
def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
|
|
24
|
+
if strategy in ["json", "record", "db", "dict"]:
|
|
25
|
+
return 1.0
|
|
26
|
+
|
|
27
|
+
if isinstance(raw_data, dict):
|
|
28
|
+
return 0.9
|
|
29
|
+
if isinstance(raw_data, list):
|
|
30
|
+
if len(raw_data) > 0 and isinstance(raw_data[0], dict):
|
|
31
|
+
return 0.9
|
|
32
|
+
return 0.1
|
|
33
|
+
|
|
34
|
+
return 0.0
|
|
35
|
+
|
|
20
36
|
def _do_normalize(self, raw_data: Any) -> List[SayouBlock]:
|
|
21
37
|
"""
|
|
22
38
|
Convert dict or list of dicts into record blocks.
|
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
import pkgutil
|
|
3
|
+
from typing import Any, Dict, List, Optional, Type
|
|
4
|
+
|
|
5
|
+
from sayou.core.base_component import BaseComponent
|
|
6
|
+
from sayou.core.decorators import safe_run
|
|
7
|
+
from sayou.core.registry import COMPONENT_REGISTRY
|
|
8
|
+
from sayou.core.schemas import SayouBlock
|
|
9
|
+
|
|
10
|
+
from .core.exceptions import RefineryError
|
|
11
|
+
from .interfaces.base_normalizer import BaseNormalizer
|
|
12
|
+
from .interfaces.base_processor import BaseProcessor
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class RefineryPipeline(BaseComponent):
|
|
16
|
+
"""
|
|
17
|
+
Orchestrates the data refinement process via dynamic registry.
|
|
18
|
+
|
|
19
|
+
Workflow:
|
|
20
|
+
1. Normalization: Converts raw input (Document, HTML, JSON) into standard SayouBlocks.
|
|
21
|
+
2. Processing: Applies a chain of processors (Cleaning, Masking, Dedup) to the blocks.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
component_name = "RefineryPipeline"
|
|
25
|
+
|
|
26
|
+
def __init__(
|
|
27
|
+
self,
|
|
28
|
+
extra_normalizers: Optional[List[Type[BaseNormalizer]]] = None,
|
|
29
|
+
**kwargs,
|
|
30
|
+
):
|
|
31
|
+
"""
|
|
32
|
+
Initializes the pipeline and discovers available plugins.
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
extra_normalizers: Optional list of custom normalizer classes to register.
|
|
36
|
+
**kwargs: Global configuration passed down to components.
|
|
37
|
+
e.g., processors=["cleaner", "pii_masker"]
|
|
38
|
+
"""
|
|
39
|
+
super().__init__()
|
|
40
|
+
|
|
41
|
+
self.normalizer_cls_map: Dict[str, Type[BaseNormalizer]] = {}
|
|
42
|
+
self.processor_cls_map: Dict[str, Type[BaseProcessor]] = {}
|
|
43
|
+
|
|
44
|
+
self._register("sayou.refinery.normalizer")
|
|
45
|
+
self._register("sayou.refinery.processor")
|
|
46
|
+
self._register("sayou.refinery.plugins")
|
|
47
|
+
|
|
48
|
+
self._load_from_registry()
|
|
49
|
+
|
|
50
|
+
if extra_normalizers:
|
|
51
|
+
for cls in extra_normalizers:
|
|
52
|
+
self._register_manual(cls)
|
|
53
|
+
|
|
54
|
+
self.global_config = kwargs
|
|
55
|
+
|
|
56
|
+
self.initialize(**kwargs)
|
|
57
|
+
|
|
58
|
+
def _register_manual(self, cls):
|
|
59
|
+
"""
|
|
60
|
+
Safely registers a user-provided class.
|
|
61
|
+
"""
|
|
62
|
+
if not isinstance(cls, type):
|
|
63
|
+
raise TypeError(
|
|
64
|
+
f"Invalid normalizer: {cls}. "
|
|
65
|
+
f"Please pass the CLASS itself (e.g., MyNormalizer), not an instance (MyNormalizer())."
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
name = getattr(cls, "component_name", cls.__name__)
|
|
69
|
+
self.normalizer_cls_map[name] = cls
|
|
70
|
+
|
|
71
|
+
@classmethod
|
|
72
|
+
def process(
|
|
73
|
+
cls,
|
|
74
|
+
raw_data: Any,
|
|
75
|
+
strategy: str = "auto",
|
|
76
|
+
processors: List[str] = None,
|
|
77
|
+
**kwargs,
|
|
78
|
+
) -> List[SayouBlock]:
|
|
79
|
+
"""
|
|
80
|
+
[Facade] One-line execution method.
|
|
81
|
+
|
|
82
|
+
Args:
|
|
83
|
+
raw_data (Any): Input data to refine.
|
|
84
|
+
strategy (str): Hint for normalizer selection (default: 'auto').
|
|
85
|
+
**kwargs: Configuration options.
|
|
86
|
+
|
|
87
|
+
Returns:
|
|
88
|
+
List[SayouBlock]: Refined data blocks.
|
|
89
|
+
"""
|
|
90
|
+
instance = cls(**kwargs)
|
|
91
|
+
return instance.run(raw_data, strategy, processors, **kwargs)
|
|
92
|
+
|
|
93
|
+
def _register(self, package_name: str):
|
|
94
|
+
"""
|
|
95
|
+
Automatically discovers and registers plugins from the specified package.
|
|
96
|
+
|
|
97
|
+
Args:
|
|
98
|
+
package_name (str): The dot-separated package path (e.g., 'sayou.refinery.plugins').
|
|
99
|
+
"""
|
|
100
|
+
try:
|
|
101
|
+
package = importlib.import_module(package_name)
|
|
102
|
+
if hasattr(package, "__path__"):
|
|
103
|
+
for _, name, _ in pkgutil.iter_modules(package.__path__):
|
|
104
|
+
full_name = f"{package_name}.{name}"
|
|
105
|
+
try:
|
|
106
|
+
importlib.import_module(full_name)
|
|
107
|
+
self._log(f"Discovered module: {full_name}", level="debug")
|
|
108
|
+
except Exception as e:
|
|
109
|
+
self._log(
|
|
110
|
+
f"Failed to import module {full_name}: {e}", level="warning"
|
|
111
|
+
)
|
|
112
|
+
except ImportError as e:
|
|
113
|
+
self._log(f"Package not found: {package_name} ({e})", level="debug")
|
|
114
|
+
|
|
115
|
+
def _load_from_registry(self):
|
|
116
|
+
"""
|
|
117
|
+
Populates local component maps from the global registry.
|
|
118
|
+
"""
|
|
119
|
+
if "normalizer" in COMPONENT_REGISTRY:
|
|
120
|
+
self.normalizer_cls_map.update(COMPONENT_REGISTRY["normalizer"])
|
|
121
|
+
|
|
122
|
+
if "processor" in COMPONENT_REGISTRY:
|
|
123
|
+
self.processor_cls_map.update(COMPONENT_REGISTRY["processor"])
|
|
124
|
+
|
|
125
|
+
@safe_run(default_return=None)
|
|
126
|
+
def initialize(self, **kwargs):
|
|
127
|
+
"""
|
|
128
|
+
Initialize all sub-components (Normalizers and Processors).
|
|
129
|
+
Passes global configuration (like PII masking rules) down to components.
|
|
130
|
+
"""
|
|
131
|
+
"""
|
|
132
|
+
Updates global configuration and logs status.
|
|
133
|
+
Actual component instantiation happens lazily during run().
|
|
134
|
+
|
|
135
|
+
Args:
|
|
136
|
+
**kwargs: Updates to the global configuration.
|
|
137
|
+
"""
|
|
138
|
+
self.global_config.update(kwargs)
|
|
139
|
+
|
|
140
|
+
n_norm = len(self.normalizer_cls_map)
|
|
141
|
+
n_proc = len(self.processor_cls_map)
|
|
142
|
+
self._log(
|
|
143
|
+
f"RefineryPipeline initialized. Available: {n_norm} Normalizers, {n_proc} Processors."
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
def run(
|
|
147
|
+
self,
|
|
148
|
+
raw_data: Any,
|
|
149
|
+
strategy: str = "auto",
|
|
150
|
+
processors: Optional[List[str]] = None,
|
|
151
|
+
**kwargs,
|
|
152
|
+
) -> List[SayouBlock]:
|
|
153
|
+
"""
|
|
154
|
+
Executes the refinement pipeline: Normalize -> Process Chain.
|
|
155
|
+
|
|
156
|
+
Args:
|
|
157
|
+
raw_data (Any): Input data (Document object, dict, string, etc.).
|
|
158
|
+
strategy (str): Hint for normalizer (default: 'auto').
|
|
159
|
+
processors (List[str], optional): List of processor names to execute in order.
|
|
160
|
+
If None, executes all registered processors (or a default set).
|
|
161
|
+
**kwargs: Runtime configuration.
|
|
162
|
+
|
|
163
|
+
Returns:
|
|
164
|
+
List[SayouBlock]: A list of clean, normalized blocks.
|
|
165
|
+
"""
|
|
166
|
+
if raw_data is None:
|
|
167
|
+
return []
|
|
168
|
+
|
|
169
|
+
run_config = {**self.global_config, **kwargs}
|
|
170
|
+
|
|
171
|
+
self._emit("on_start", input_data={"strategy": strategy})
|
|
172
|
+
|
|
173
|
+
# ---------------------------------------------------------
|
|
174
|
+
# Step 1: Normalize (Smart Routing)
|
|
175
|
+
# ---------------------------------------------------------
|
|
176
|
+
normalizer_cls = self._resolve_normalizer(raw_data, strategy)
|
|
177
|
+
|
|
178
|
+
if not normalizer_cls:
|
|
179
|
+
error_msg = f"No suitable normalizer found for strategy='{strategy}'"
|
|
180
|
+
self._emit("on_error", error=Exception(error_msg))
|
|
181
|
+
raise RefineryError(error_msg)
|
|
182
|
+
|
|
183
|
+
# Instantiate Normalizer
|
|
184
|
+
normalizer = normalizer_cls()
|
|
185
|
+
normalizer.initialize(**run_config)
|
|
186
|
+
|
|
187
|
+
try:
|
|
188
|
+
self._log(f"Normalizing with {normalizer.component_name}...")
|
|
189
|
+
blocks = normalizer.normalize(raw_data)
|
|
190
|
+
except Exception as e:
|
|
191
|
+
self._emit("on_error", error=e)
|
|
192
|
+
self._log(f"Normalization failed: {e}", level="error")
|
|
193
|
+
return []
|
|
194
|
+
|
|
195
|
+
# ---------------------------------------------------------
|
|
196
|
+
# Step 2: Process Chain (Dynamic Execution)
|
|
197
|
+
# ---------------------------------------------------------
|
|
198
|
+
chain_names = (
|
|
199
|
+
processors if processors is not None else run_config.get("processors", [])
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
if not chain_names and not processors:
|
|
203
|
+
chain_names = []
|
|
204
|
+
|
|
205
|
+
active_processors = []
|
|
206
|
+
|
|
207
|
+
for name in chain_names:
|
|
208
|
+
proc_cls = self._resolve_processor_by_name(name)
|
|
209
|
+
if proc_cls:
|
|
210
|
+
proc = proc_cls()
|
|
211
|
+
proc.initialize(**run_config)
|
|
212
|
+
active_processors.append(proc)
|
|
213
|
+
else:
|
|
214
|
+
self._log(f"Processor '{name}' not found in registry.", level="warning")
|
|
215
|
+
|
|
216
|
+
for proc in active_processors:
|
|
217
|
+
try:
|
|
218
|
+
self._log(f"Running Processor: {proc.component_name}")
|
|
219
|
+
blocks = proc.process(blocks)
|
|
220
|
+
except Exception as e:
|
|
221
|
+
self._log(f"Processor {proc.component_name} failed: {e}", level="error")
|
|
222
|
+
|
|
223
|
+
self._emit("on_finish", result_data={"blocks_count": len(blocks)}, success=True)
|
|
224
|
+
return blocks
|
|
225
|
+
|
|
226
|
+
def _resolve_normalizer(
|
|
227
|
+
self,
|
|
228
|
+
raw_data: Any,
|
|
229
|
+
strategy: str,
|
|
230
|
+
) -> Optional[Type[BaseNormalizer]]:
|
|
231
|
+
"""
|
|
232
|
+
Selects the best normalizer based on score or explicit type match.
|
|
233
|
+
"""
|
|
234
|
+
if strategy in self.normalizer_cls_map:
|
|
235
|
+
return self.normalizer_cls_map[strategy]
|
|
236
|
+
|
|
237
|
+
best_score = 0.0
|
|
238
|
+
best_cls = None
|
|
239
|
+
|
|
240
|
+
log_lines = [
|
|
241
|
+
f"Scoring for Item (Type: {raw_data.type}, Len: {len(raw_data.content)}):",
|
|
242
|
+
f"Content: {raw_data.content[:30]}",
|
|
243
|
+
]
|
|
244
|
+
|
|
245
|
+
for cls in set(self.normalizer_cls_map.values()):
|
|
246
|
+
try:
|
|
247
|
+
score = cls.can_handle(raw_data, strategy)
|
|
248
|
+
|
|
249
|
+
mark = ""
|
|
250
|
+
if score > best_score:
|
|
251
|
+
best_score = score
|
|
252
|
+
best_cls = cls
|
|
253
|
+
mark = "👑"
|
|
254
|
+
|
|
255
|
+
log_lines.append(f" - {cls.__name__}: {score} {mark}")
|
|
256
|
+
|
|
257
|
+
except Exception as e:
|
|
258
|
+
log_lines.append(f" - {cls.__name__}: Error ({e})")
|
|
259
|
+
|
|
260
|
+
self._log("\n".join(log_lines))
|
|
261
|
+
|
|
262
|
+
if best_cls and best_score > 0.0:
|
|
263
|
+
return best_cls
|
|
264
|
+
|
|
265
|
+
self._log(
|
|
266
|
+
"⚠️ No suitable normalizer found (Score 0).",
|
|
267
|
+
level="warning",
|
|
268
|
+
)
|
|
269
|
+
return None
|
|
270
|
+
|
|
271
|
+
def _resolve_processor_by_name(self, name: str) -> Optional[Type[BaseProcessor]]:
|
|
272
|
+
"""
|
|
273
|
+
Finds a processor class by its component_name or registry key.
|
|
274
|
+
"""
|
|
275
|
+
# 1. Exact Key Match
|
|
276
|
+
if name in self.processor_cls_map:
|
|
277
|
+
return self.processor_cls_map[name]
|
|
278
|
+
|
|
279
|
+
# 2. Component Name Match (Loop search)
|
|
280
|
+
for cls in self.processor_cls_map.values():
|
|
281
|
+
if getattr(cls, "component_name", "") == name:
|
|
282
|
+
return cls
|
|
283
|
+
|
|
284
|
+
return None
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
import json
|
|
2
2
|
from typing import List, Set
|
|
3
3
|
|
|
4
|
+
from sayou.core.registry import register_component
|
|
4
5
|
from sayou.core.schemas import SayouBlock
|
|
5
6
|
|
|
6
7
|
from ..interfaces.base_processor import BaseProcessor
|
|
7
8
|
|
|
8
9
|
|
|
10
|
+
@register_component("processor")
|
|
9
11
|
class Deduplicator(BaseProcessor):
|
|
10
12
|
"""
|
|
11
13
|
(Tier 2) Removes duplicate blocks based on content hashing.
|
|
@@ -16,6 +18,12 @@ class Deduplicator(BaseProcessor):
|
|
|
16
18
|
|
|
17
19
|
component_name = "Deduplicator"
|
|
18
20
|
|
|
21
|
+
@classmethod
def can_handle(cls, blocks: list) -> float:
    """Applicability score: deduplication only pays off for a list holding
    at least two blocks."""
    applicable = isinstance(blocks, list) and len(blocks) > 1
    return 1.0 if applicable else 0.0
|
|
26
|
+
|
|
19
27
|
def _do_process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
|
|
20
28
|
"""
|
|
21
29
|
Iterate through blocks and remove duplicates.
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
from typing import Any, Dict, List
|
|
2
2
|
|
|
3
|
+
from sayou.core.registry import register_component
|
|
3
4
|
from sayou.core.schemas import SayouBlock
|
|
4
5
|
|
|
5
6
|
from ..interfaces.base_processor import BaseProcessor
|
|
6
7
|
|
|
7
8
|
|
|
9
|
+
@register_component("processor")
|
|
8
10
|
class Imputer(BaseProcessor):
|
|
9
11
|
"""
|
|
10
12
|
(Tier 2) Fills missing values in 'record' type blocks using defined rules.
|
|
@@ -14,6 +16,12 @@ class Imputer(BaseProcessor):
|
|
|
14
16
|
|
|
15
17
|
component_name = "Imputer"
|
|
16
18
|
|
|
19
|
+
@classmethod
def can_handle(cls, blocks: list) -> float:
    """Applicability score: 0.8 whenever the generic base-class block check
    passes, otherwise 0."""
    base_score = super().can_handle(blocks)
    return 0.8 if base_score > 0 else 0.0
|
|
24
|
+
|
|
17
25
|
def initialize(self, imputation_rules: Dict[str, Any] = None, **kwargs):
|
|
18
26
|
"""
|
|
19
27
|
Set imputation rules.
|
{sayou_refinery-0.2.0 → sayou_refinery-0.3.1}/src/sayou/refinery/processor/outlier_handler.py
RENAMED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
from typing import Any, Dict, List
|
|
2
2
|
|
|
3
|
+
from sayou.core.registry import register_component
|
|
3
4
|
from sayou.core.schemas import SayouBlock
|
|
4
5
|
|
|
5
6
|
from ..interfaces.base_processor import BaseProcessor
|
|
6
7
|
|
|
7
8
|
|
|
9
|
+
@register_component("processor")
|
|
8
10
|
class OutlierHandler(BaseProcessor):
|
|
9
11
|
"""
|
|
10
12
|
(Tier 2) Handles numerical outliers in 'record' blocks.
|
|
@@ -15,6 +17,10 @@ class OutlierHandler(BaseProcessor):
|
|
|
15
17
|
|
|
16
18
|
component_name = "OutlierHandler"
|
|
17
19
|
|
|
20
|
+
@classmethod
def can_handle(cls, blocks: list) -> float:
    """Applicability score: 0.8 when the generic base-class check succeeds."""
    if super().can_handle(blocks) > 0:
        return 0.8
    return 0.0
|
|
23
|
+
|
|
18
24
|
def initialize(self, outlier_rules: Dict[str, Dict[str, Any]] = None, **kwargs):
|
|
19
25
|
"""
|
|
20
26
|
Set outlier handling rules.
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
import re
|
|
2
2
|
from typing import List
|
|
3
3
|
|
|
4
|
+
from sayou.core.registry import register_component
|
|
4
5
|
from sayou.core.schemas import SayouBlock
|
|
5
6
|
|
|
6
7
|
from ..interfaces.base_processor import BaseProcessor
|
|
7
8
|
|
|
8
9
|
|
|
10
|
+
@register_component("processor")
|
|
9
11
|
class PiiMasker(BaseProcessor):
|
|
10
12
|
"""
|
|
11
13
|
(Tier 2) Masks Personally Identifiable Information (PII) in text blocks.
|
|
@@ -16,6 +18,10 @@ class PiiMasker(BaseProcessor):
|
|
|
16
18
|
|
|
17
19
|
component_name = "PiiMasker"
|
|
18
20
|
|
|
21
|
+
@classmethod
def can_handle(cls, blocks: list) -> float:
    """Applicability score: PII masking applies to any list the base check
    accepts."""
    if super().can_handle(blocks) > 0:
        return 1.0
    return 0.0
|
|
24
|
+
|
|
19
25
|
def initialize(self, mask_email: bool = True, mask_phone: bool = True, **kwargs):
|
|
20
26
|
"""
|
|
21
27
|
Configure masking targets.
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
import re
|
|
2
2
|
from typing import List
|
|
3
3
|
|
|
4
|
+
from sayou.core.registry import register_component
|
|
4
5
|
from sayou.core.schemas import SayouBlock
|
|
5
6
|
|
|
6
7
|
from ..interfaces.base_processor import BaseProcessor
|
|
7
8
|
|
|
8
9
|
|
|
10
|
+
@register_component("processor")
|
|
9
11
|
class TextCleaner(BaseProcessor):
|
|
10
12
|
"""
|
|
11
13
|
(Tier 2) Cleans text content using regex and whitespace normalization.
|
|
@@ -15,6 +17,12 @@ class TextCleaner(BaseProcessor):
|
|
|
15
17
|
|
|
16
18
|
component_name = "TextCleaner"
|
|
17
19
|
|
|
20
|
+
@classmethod
def can_handle(cls, blocks: list) -> float:
    """Applicability score: text cleaning applies to any list the base check
    accepts."""
    return 1.0 if super().can_handle(blocks) > 0 else 0.0
|
|
25
|
+
|
|
18
26
|
def initialize(
|
|
19
27
|
self, patterns: List[str] = None, normalize_space: bool = True, **kwargs
|
|
20
28
|
):
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
|
|
3
|
+
from sayou.refinery.pipeline import RefineryPipeline
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class TestRefineryPipeline(unittest.TestCase):

    def setUp(self):
        # Registry loading happens lazily inside process(); nothing to do here.
        pass

    def test_doc_markdown_normalization_with_dict(self):
        """[Normalizer] A raw dict shaped like a Document is converted to
        Markdown blocks (duck typing: no Document class required)."""
        # Use a raw dictionary instead of a Document object to verify
        # Refinery's independence from upstream types.
        document_payload = {
            "doc_type": "pdf",
            "pages": [
                {
                    "elements": [
                        {
                            "type": "text",
                            "text": "Contact: test@test.com",
                            "meta": {"semantic_type": "heading"},
                        }
                    ]
                }
            ],
        }

        # One-liner: construct + run.
        # strategy="standard_doc" forces DocMarkdownNormalizer.
        result_blocks = RefineryPipeline.process(
            document_payload, strategy="standard_doc"
        )

        print(f"\n[Test 1] Generated Blocks: {result_blocks}")

        self.assertTrue(len(result_blocks) > 0)
        self.assertEqual(result_blocks[0].type, "md")
        self.assertIn("Contact:", result_blocks[0].content)

    def test_deduplication_processor(self):
        """[Processor] The deduplicator runs when explicitly requested."""
        document_payload = {
            "pages": [
                {
                    "elements": [
                        {"type": "text", "text": "Unique Line"},
                        {"type": "text", "text": "Dup Line"},
                        {"type": "text", "text": "Dup Line"},  # duplicate
                    ]
                }
            ]
        }

        # Explicitly request the Deduplicator in the chain.
        result_blocks = RefineryPipeline.process(
            document_payload, strategy="standard_doc", processors=["Deduplicator"]
        )

        # 3 inputs -> 2 outputs (duplicate removed).
        self.assertEqual(len(result_blocks), 2)

        contents = [block.content for block in result_blocks]
        self.assertEqual(contents.count("Dup Line"), 1)

    def test_record_processing_chain(self):
        """[Chain] JSON records -> imputation -> outlier removal."""
        record_rows = [
            {"id": 1, "category": None, "score": 50},  # imputation target
            {"id": 2, "category": "A", "score": 200},  # outlier target (max: 100)
            {"id": 3, "category": "B", "score": 90},  # healthy row
        ]

        # Runtime configuration injected straight into process().
        runtime_config = {
            "imputation_rules": {"category": "General"},  # None -> 'General'
            "outlier_rules": {"score": {"max": 100, "action": "drop"}},  # drop > 100
        }

        result_blocks = RefineryPipeline.process(
            record_rows,
            strategy="json",  # selects RecordNormalizer
            processors=["Imputer", "OutlierHandler"],  # executed in order
            **runtime_config,
        )

        # 3 inputs -> 2 outputs (id 2 dropped).
        self.assertEqual(len(result_blocks), 2)

        # Imputer result: None -> "General".
        first_record = result_blocks[0].content  # RecordNormalizer keeps dicts
        self.assertEqual(first_record["category"], "General")

        # Outlier result: id 2 is gone, id 3 survives.
        surviving_ids = [block.content["id"] for block in result_blocks]
        self.assertNotIn(2, surviving_ids)
        self.assertIn(3, surviving_ids)

    def test_auto_routing_html(self):
        """[Auto] HTML input is detected when no strategy is given."""
        html_payload = "<html><body><div>Hello World</div></body></html>"

        # strategy="auto" is the default.
        result_blocks = RefineryPipeline.process(html_payload)

        # HtmlTextNormalizer should have been selected and extracted the text.
        self.assertTrue(len(result_blocks) > 0)
        self.assertEqual(result_blocks[0].type, "text")
        self.assertIn("Hello World", result_blocks[0].content)
|
+
|
|
119
|
+
|
|
120
|
+
if __name__ == "__main__":
    # Allow running this test module directly: `python test_refinery.py`.
    unittest.main()
|
|
@@ -1,109 +0,0 @@
|
|
|
1
|
-
from typing import Any, Dict, List, Optional
|
|
2
|
-
|
|
3
|
-
from sayou.core.base_component import BaseComponent
|
|
4
|
-
from sayou.core.decorators import safe_run
|
|
5
|
-
from sayou.core.schemas import SayouBlock
|
|
6
|
-
|
|
7
|
-
from .core.exceptions import RefineryError
|
|
8
|
-
from .interfaces.base_normalizer import BaseNormalizer
|
|
9
|
-
from .interfaces.base_processor import BaseProcessor
|
|
10
|
-
from .normalizer.doc_markdown_normalizer import DocMarkdownNormalizer
|
|
11
|
-
from .normalizer.html_text_normalizer import HtmlTextNormalizer
|
|
12
|
-
from .normalizer.record_normalizer import RecordNormalizer
|
|
13
|
-
from .processor.deduplicator import Deduplicator
|
|
14
|
-
from .processor.imputer import Imputer
|
|
15
|
-
from .processor.outlier_handler import OutlierHandler
|
|
16
|
-
from .processor.pii_masker import PiiMasker
|
|
17
|
-
from .processor.text_cleaner import TextCleaner
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class RefineryPipeline(BaseComponent):
    """
    Orchestrates the data refinement process.
    1. Selects a Normalizer to convert raw data into standard SayouBlocks.
    2. Runs a chain of Processors to clean and transform the blocks.
    """

    component_name = "RefineryPipeline"

    def __init__(
        self,
        extra_normalizers: Optional[List[BaseNormalizer]] = None,
        processors: Optional[List[BaseProcessor]] = None,
    ):
        super().__init__()
        # Normalizers keyed by the source types they support; a later
        # registration with the same type overrides an earlier one, so
        # user-supplied extras win over the defaults below.
        self.normalizers: Dict[str, BaseNormalizer] = {}

        # 1. Register Default Normalizers
        defaults = [DocMarkdownNormalizer(), HtmlTextNormalizer(), RecordNormalizer()]
        self._register(defaults)

        # 2. Register User Extras
        if extra_normalizers:
            self._register(extra_normalizers)

        # 3. Setup Processors Chain
        # When no explicit chain is supplied, every built-in processor runs
        # in this fixed order.
        self.processors = (
            processors
            if processors is not None
            else [
                TextCleaner(),
                PiiMasker(),
                Deduplicator(),
                Imputer(),
                OutlierHandler(),
            ]
        )

    def _register(self, comps: List[BaseNormalizer]):
        # Map each normalizer under every source type it declares support for.
        for c in comps:
            for t in getattr(c, "SUPPORTED_TYPES", []):
                self.normalizers[t] = c

    @safe_run(default_return=None)
    def initialize(self, **kwargs):
        """
        Initialize all sub-components (Normalizers and Processors).
        Passes global configuration (like PII masking rules) down to components.
        """
        # set() dedupes normalizers registered under multiple source types.
        for norm in set(self.normalizers.values()):
            norm.initialize(**kwargs)

        for proc in self.processors:
            proc.initialize(**kwargs)

        self._log(
            f"Refinery initialized with {len(self.processors)} processors in chain."
        )

    def run(self, raw_data: Any, source_type: str = "standard_doc") -> List[SayouBlock]:
        """
        Execute the refinement pipeline.

        Args:
            raw_data: The raw input data (dict, html string, db row list, etc.)
            source_type: The type of input data (e.g., 'standard_doc', 'html', 'json')

        Returns:
            List[SayouBlock]: A list of clean, normalized blocks.
        """
        # Step 1: Normalize (Structure Transformation)
        normalizer = self.normalizers.get(source_type)
        if not normalizer:
            supported = list(self.normalizers.keys())
            raise RefineryError(
                f"Unknown source_type '{source_type}'. Supported: {supported}"
            )

        try:
            blocks = normalizer.normalize(raw_data)
        except Exception as e:
            # Normalization failure is non-fatal: log and return no blocks.
            self.logger.error(f"Normalization step failed: {e}")
            return []

        # Step 2: Process (Content Cleaning)
        # Processors modify blocks in-place or return new lists
        for processor in self.processors:
            blocks = processor.process(blocks)

        return blocks
|
|
@@ -1,83 +0,0 @@
|
|
|
1
|
-
import unittest
|
|
2
|
-
|
|
3
|
-
from sayou.refinery.pipeline import RefineryPipeline
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class TestRefineryPipeline(unittest.TestCase):

    def setUp(self):
        # Build one pipeline per test and push shared configuration
        # (masking + imputation + outlier rules) down to every component.
        self.pipeline = RefineryPipeline()

        self.pipeline.initialize(
            mask_email=True,
            imputation_rules={"tag": "general"},
            outlier_rules={"score": {"max": 100, "action": "drop"}},
        )

    def test_doc_markdown_normalization(self):
        """[Normalizer] Verify a document dict converts to a Markdown block and gets masked."""
        raw_doc = {
            "pages": [
                {
                    "elements": [
                        {
                            "type": "text",
                            "text": "Contact: test@test.com",
                            "raw_attributes": {"semantic_type": "heading"},
                        }
                    ]
                }
            ]
        }

        blocks = self.pipeline.run(raw_doc, source_type="standard_doc")

        self.assertEqual(len(blocks), 1)
        self.assertEqual(blocks[0].type, "md")
        # Heading prefix from the normalizer, [EMAIL] from the PII masker.
        self.assertEqual(blocks[0].content, "# Contact: [EMAIL]")

    def test_deduplication(self):
        """[Processor] Verify duplicated text blocks are removed."""
        raw_doc = {
            "pages": [
                {
                    "elements": [
                        {"type": "text", "text": "Unique Line Content"},
                        {"type": "text", "text": "Duplicate Line Content"},
                        {"type": "text", "text": "Duplicate Line Content"},
                    ]
                }
            ]
        }
        blocks = self.pipeline.run(raw_doc, source_type="standard_doc")

        # 3 inputs -> 2 outputs (duplicate removed)
        self.assertEqual(len(blocks), 2)

        content_list = [b.content for b in blocks]
        self.assertIn("Unique Line Content", content_list)
        self.assertEqual(content_list.count("Duplicate Line Content"), 1)

    def test_record_processing(self):
        """[Processor] Verify record imputation and outlier dropping."""
        raw_records = [
            {"id": 1, "tag": None, "score": 50},  # imputation target
            {"id": 2, "tag": "A", "score": 200},  # outlier (drop) target (max: 100)
            {"id": 3, "tag": "B", "score": 90},  # healthy row
        ]

        blocks = self.pipeline.run(raw_records, source_type="json")

        # 3 inputs -> 2 outputs
        self.assertEqual(len(blocks), 2)

        block1 = blocks[0].content
        self.assertEqual(block1["tag"], "general")

        ids = [b.content["id"] for b in blocks]
        self.assertNotIn(2, ids)
        self.assertIn(3, ids)
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
if __name__ == "__main__":
    # Allow running this test module directly: `python test_refinery.py`.
    unittest.main()
|
|
File without changes
|
|
File without changes
|