PyPI - sayou-refinery - Versions diffs - 0.1.6__py3-none-any.whl → 0.3.3__py3-none-any.whl - Mend

sayou-refinery 0.1.6__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

sayou/refinery/__init__.py +21 -0
sayou/refinery/core/exceptions.py +1 -1
sayou/refinery/interfaces/base_normalizer.py +29 -8
sayou/refinery/interfaces/base_processor.py +29 -9
sayou/refinery/normalizer/doc_markdown_normalizer.py +107 -39
sayou/refinery/normalizer/html_text_normalizer.py +36 -10
sayou/refinery/normalizer/record_normalizer.py +26 -9
sayou/refinery/pipeline.py +251 -63
sayou/refinery/processor/deduplicator.py +14 -5
sayou/refinery/processor/imputer.py +13 -4
sayou/refinery/processor/outlier_handler.py +11 -4
sayou/refinery/processor/pii_masker.py +11 -4
sayou/refinery/processor/text_cleaner.py +13 -4
{sayou_refinery-0.1.6.dist-info → sayou_refinery-0.3.3.dist-info}/METADATA +6 -6
sayou_refinery-0.3.3.dist-info/RECORD +16 -0
sayou/refinery/core/schemas.py +0 -27
sayou_refinery-0.1.6.dist-info/RECORD +0 -16
{sayou_refinery-0.1.6.dist-info → sayou_refinery-0.3.3.dist-info}/WHEEL +0 -0

sayou/refinery/__init__.py ADDED Viewed

@@ -0,0 +1,21 @@
+from .pipeline import RefineryPipeline
+from .normalizer.doc_markdown_normalizer import DocMarkdownNormalizer
+from .normalizer.html_text_normalizer import HtmlTextNormalizer
+from .normalizer.record_normalizer import RecordNormalizer
+from .processor.deduplicator import Deduplicator
+from .processor.imputer import Imputer
+from .processor.outlier_handler import OutlierHandler
+from .processor.pii_masker import PiiMasker
+from .processor.text_cleaner import TextCleaner
+__all__ = [
+    "RefineryPipeline",
+    "DocMarkdownNormalizer",
+    "HtmlTextNormalizer",
+    "RecordNormalizer",
+    "Deduplicator",
+    "Imputer",
+    "OutlierHandler",
+    "PiiMasker",
+    "TextCleaner",
+]

sayou/refinery/core/exceptions.py CHANGED Viewed

@@ -11,7 +11,7 @@ class RefineryError(SayouCoreError):
 class NormalizationError(RefineryError):
     """
-    Raised when raw data cannot be converted to ContentBlocks.
+    Raised when raw data cannot be converted to SayouBlocks.
     (e.g., Malformed JSON, Unsupported format)
     """

sayou/refinery/interfaces/base_normalizer.py CHANGED Viewed

@@ -3,24 +3,38 @@ from typing import Any, List
 from sayou.core.base_component import BaseComponent
 from sayou.core.decorators import measure_time
+from sayou.core.schemas import SayouBlock
 from ..core.exceptions import NormalizationError
-from ..core.schemas import ContentBlock
 class BaseNormalizer(BaseComponent):
     """
-    (Tier 1) Abstract base class for converting raw input into ContentBlocks.
+    (Tier 1) Abstract base class for converting raw input into SayouBlock.
     Normalizers are responsible for structural transformation:
-    Raw Data (JSON, HTML, DB Row) -> List[ContentBlock]
+    Raw Data (JSON, HTML, DB Row) -> List[SayouBlock]
     """
     component_name = "BaseNormalizer"
     SUPPORTED_TYPES = []
+    @classmethod
+    def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
+        """
+        Determines if this normalizer can handle the raw input data.
+        Args:
+            raw_data: The input data (dict, str, Document object, etc.)
+            strategy: Explicit type hint from user (e.g. 'html', 'json')
+        Returns:
+            float: Confidence score (0.0 to 1.0)
+        """
+        return 0.0
     @measure_time
-    def normalize(self, raw_data: Any) -> List[ContentBlock]:
+    def normalize(self, raw_data: Any) -> List[SayouBlock]:
         """
         Execute the normalization process.
@@ -28,20 +42,27 @@ class BaseNormalizer(BaseComponent):
             raw_data: The raw input data from Connector or Document.
         Returns:
-            List[ContentBlock]: A list of normalized content blocks.
+            List[SayouBlock]: A list of normalized content blocks.
         Raises:
             NormalizationError: If transformation fails.
         """
+        self._emit("on_start", input_data={"type": type(raw_data).__name__})
         self._log(f"Normalizing data (Type: {type(raw_data).__name__})")
         try:
             blocks = self._do_normalize(raw_data)
+            self._emit("on_finish", result_data={"blocks": len(blocks)}, success=True)
             if not isinstance(blocks, list):
                 raise NormalizationError(f"Output must be a list, got {type(blocks)}")
             return blocks
         except Exception as e:
+            self._emit("on_error", error=e)
             wrapped_error = NormalizationError(
                 f"[{self.component_name}] Failed: {str(e)}"
             )
@@ -49,14 +70,14 @@ class BaseNormalizer(BaseComponent):
             raise wrapped_error
     @abstractmethod
-    def _do_normalize(self, raw_data: Any) -> List[ContentBlock]:
+    def _do_normalize(self, raw_data: Any) -> List[SayouBlock]:
         """
-        [Abstract Hook] Implement logic to convert specific raw format to ContentBlocks.
+        [Abstract Hook] Implement logic to convert specific raw format to SayouBlocks.
         Args:
             raw_data: The raw input.
         Returns:
-            List[ContentBlock]: The standardized blocks.
+            List[SayouBlock]: The standardized blocks.
         """
         raise NotImplementedError

sayou/refinery/interfaces/base_processor.py CHANGED Viewed

@@ -3,14 +3,14 @@ from typing import List
 from sayou.core.base_component import BaseComponent
 from sayou.core.decorators import measure_time
+from sayou.core.schemas import SayouBlock
 from ..core.exceptions import ProcessingError
-from ..core.schemas import ContentBlock
 class BaseProcessor(BaseComponent):
     """
-    (Tier 1) Abstract base class for processing/cleaning ContentBlocks.
+    (Tier 1) Abstract base class for processing/cleaning SayouBlock.
     Processors operate on data that is already normalized. They can modify content
     (e.g., PII masking, Imputation) or filter out blocks (e.g., Deduplication).
@@ -18,40 +18,60 @@ class BaseProcessor(BaseComponent):
     component_name = "BaseProcessor"
+    @classmethod
+    def can_handle(cls, blocks: List[SayouBlock]) -> float:
+        """
+        Processors are usually explicitly chained, but this allows for
+        future smart-selection (e.g., auto-detecting PII).
+        """
+        if (
+            isinstance(blocks, list)
+            and len(blocks) > 0
+            and isinstance(blocks[0], SayouBlock)
+        ):
+            return 0.5
+        return 0.0
     @measure_time
-    def process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
+    def process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
         """
         Execute the processing logic on a list of blocks.
         Args:
-            blocks: Input list of ContentBlocks.
+            blocks: Input list of SayouBlocks.
         Returns:
-            List[ContentBlock]: Processed list of ContentBlocks.
+            List[SayouBlock]: Processed list of SayouBlocks.
         Raises:
             ProcessingError: If processing logic fails.
         """
+        self._emit("on_start", input_data={"blocks": len(blocks)})
         try:
             if not blocks:
                 return []
-            return self._do_process(blocks)
+            result = self._do_process(blocks)
+            self._emit("on_finish", result_data={"blocks": len(result)}, success=True)
+            return result
         except Exception as e:
+            self._emit("on_error", error=e)
             wrapped_error = ProcessingError(f"[{self.component_name}] Failed: {str(e)}")
             self.logger.error(wrapped_error, exc_info=True)
             raise wrapped_error
     @abstractmethod
-    def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
+    def _do_process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
         """
         [Abstract Hook] Implement cleaning/filtering logic.
         Args:
-            blocks: List of input ContentBlocks.
+            blocks: List of input SayouBlocks.
         Returns:
-            List[ContentBlock]: Modified list of ContentBlocks.
+            List[SayouBlock]: Modified list of SayouBlocks.
         """
         raise NotImplementedError

sayou/refinery/normalizer/doc_markdown_normalizer.py CHANGED Viewed

@@ -1,13 +1,16 @@
 from typing import Any, Dict, List
+from sayou.core.registry import register_component
+from sayou.core.schemas import SayouBlock
 from ..core.exceptions import NormalizationError
-from ..core.schemas import ContentBlock
 from ..interfaces.base_normalizer import BaseNormalizer
+@register_component("normalizer")
 class DocMarkdownNormalizer(BaseNormalizer):
     """
-    (Tier 2) Normalizes a Sayou Document Dictionary into Markdown ContentBlocks.
+    (Tier 2) Normalizes a Sayou Document Dictionary into Markdown SayouBlocks.
     This engine parses the structured dictionary output from 'sayou-document' and
     converts individual elements (Text, Table, Image, Chart) into semantically
@@ -17,6 +20,24 @@ class DocMarkdownNormalizer(BaseNormalizer):
     component_name = "DocMarkdownNormalizer"
     SUPPORTED_TYPES = ["standard_doc", "sayou_doc_json"]
+    @classmethod
+    def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
+        if strategy in ["markdown", "standard_doc"]:
+            return 1.0
+        if hasattr(raw_data, "doc_type") and hasattr(raw_data, "pages"):
+            return 1.0
+        if isinstance(raw_data, str):
+            if any(
+                line.strip().startswith(("#", "-", "* "))
+                for line in raw_data.splitlines()[:10]
+            ):
+                return 0.8
+            return 0.1
+        return 0.0
     def initialize(
         self,
         include_headers: bool = True,
@@ -35,7 +56,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
         self.include_headers = include_headers
         self.include_footers = include_footers
-    def _do_normalize(self, raw_data: Any) -> List[ContentBlock]:
+    def _do_normalize(self, raw_data: Any) -> List[SayouBlock]:
         """
         Execute the normalization logic on the document dictionary.
@@ -43,47 +64,94 @@ class DocMarkdownNormalizer(BaseNormalizer):
             raw_data (Any): The input dictionary adhering to Sayou Document Schema.
         Returns:
-            List[ContentBlock]: A list of normalized content blocks (mostly 'md' type).
+            List[SayouBlock]: A list of normalized content blocks (mostly 'md' type).
         Raises:
             NormalizationError: If `raw_data` is not a valid dictionary.
         """
-        if not isinstance(raw_data, dict):
+        # 1. Input Handling (Dict/Object/Str Safe Conversion)
+        if isinstance(raw_data, str):
+            return [SayouBlock(type="md", content=raw_data, metadata={})]
+        # Handle Pydantic models or objects safely
+        if hasattr(raw_data, "model_dump"):
+            doc_data = raw_data.model_dump()
+        elif hasattr(raw_data, "dict"):
+            doc_data = raw_data.dict()
+        elif hasattr(raw_data, "__dict__"):
+            doc_data = raw_data.__dict__
+        elif isinstance(raw_data, dict):
+            doc_data = raw_data
+        else:
             raise NormalizationError(
-                f"Input must be a Dictionary, got {type(raw_data).__name__}"
+                f"Input must be convertible to Dictionary, got {type(raw_data).__name__}"
             )
-        doc_data = raw_data
-        blocks: List[ContentBlock] = []
+        normalized_blocks: List[SayouBlock] = []
-        if "metadata" in doc_data and doc_data["metadata"]:
-            blocks.extend(self._handle_doc_metadata(doc_data))
+        doc_meta = doc_data.get("metadata", {})
-        for page in doc_data.get("pages", []):
-            if self.include_headers and "header_elements" in page:
-                for element in page.get("header_elements", []):
-                    blocks.extend(
-                        self._handle_element(element, is_header=True, is_footer=False)
-                    )
+        def sanitize_text(text: str) -> str:
+            if not text:
+                return ""
+            text = text.replace("\x0b", "\n")
+            text = text.replace("\r", "\n")
+            text = text.replace("\f", "\n")
+            return text
-            for element in page.get("elements", []):
-                blocks.extend(
-                    self._handle_element(element, is_header=False, is_footer=False)
+        # 2. Iterate Pages
+        for page in doc_data.get("pages", []):
+            page_content_buffer = []
+            page_num = page.get("page_index", 0)
+            # Helper to extract text from elements using existing logic
+            def collect_text(elements, is_header=False, is_footer=False):
+                if not elements:
+                    return
+                for element in elements:
+                    sub_blocks = self._handle_element(element, is_header, is_footer)
+                    for sb in sub_blocks:
+                        if sb.content and sb.content.strip():
+                            clean_content = sanitize_text(sb.content.strip())
+                            page_content_buffer.append(clean_content)
+            # A. Header Elements
+            if self.include_headers:
+                collect_text(page.get("header_elements", []), is_header=True)
+            # B. Body Elements (Main Content)
+            collect_text(page.get("elements", []), is_header=False)
+            # C. Footer Elements
+            if self.include_footers:
+                collect_text(page.get("footer_elements", []), is_footer=True)
+            # 3. Aggregate: Create ONE Block per Page
+            if page_content_buffer:
+                full_page_text = "\n\n".join(page_content_buffer)
+                block_meta = doc_meta.copy()
+                block_meta.update(
+                    {
+                        "page_num": page_num,
+                        "origin_type": "page_aggregated",
+                        "source": doc_meta.get("filename", "unknown"),
+                    }
                 )
-            if self.include_footers and "footer_elements" in page:
-                for element in page.get("footer_elements", []):
-                    # T2의 기본 규칙: include_footers가 True여도 _handle_element에서
-                    # is_footer=True 플래그를 보고 무시할 수 있음 (T3가 오버라이드 가능)
-                    blocks.extend(
-                        self._handle_element(element, is_header=False, is_footer=True)
+                normalized_blocks.append(
+                    SayouBlock(
+                        type="md",
+                        content=full_page_text,
+                        metadata=block_meta,
                     )
+                )
-        return blocks
+        return normalized_blocks
     def _handle_element(
         self, element: Dict[str, Any], is_header: bool, is_footer: bool
-    ) -> List[ContentBlock]:
+    ) -> List[SayouBlock]:
         """
         Dispatch the element to specific handlers based on its 'type' field.
@@ -93,7 +161,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
             is_footer (bool): True if the element is part of the page footer.
         Returns:
-            List[ContentBlock]: The resulting block(s) from the element.
+            List[SayouBlock]: The resulting block(s) from the element.
         """
         if is_footer and not self.include_footers:
             return []
@@ -114,7 +182,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
         return []
-    def _handle_doc_metadata(self, doc_data: Dict[str, Any]) -> List[ContentBlock]:
+    def _handle_doc_metadata(self, doc_data: Dict[str, Any]) -> List[SayouBlock]:
         """
         Convert document-level metadata into a Markdown Frontmatter block.
@@ -122,7 +190,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
             doc_data (Dict[str, Any]): The root document dictionary containing 'metadata'.
         Returns:
-            List[ContentBlock]: A single block containing YAML-like frontmatter.
+            List[SayouBlock]: A single block containing YAML-like frontmatter.
         """
         md_frontmatter = "---\n"
         metadata = doc_data.get("metadata", {})
@@ -137,7 +205,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
         md_frontmatter += "---\n\n"
         return [
-            ContentBlock(
+            SayouBlock(
                 type="md",
                 content=md_frontmatter,
                 metadata={"page_num": 0, "id": "metadata", "is_footer": False},
@@ -146,7 +214,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
     def _handle_text(
         self, element: Dict[str, Any], is_header: bool, is_footer: bool
-    ) -> List[ContentBlock]:
+    ) -> List[SayouBlock]:
         """
         Convert a text element to a Markdown block, handling headings and lists.
@@ -183,7 +251,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
             content = text
         return [
-            ContentBlock(
+            SayouBlock(
                 type="md",
                 content=content,
                 metadata={
@@ -197,7 +265,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
     def _handle_table(
         self, element: Dict[str, Any], is_header: bool, is_footer: bool
-    ) -> List[ContentBlock]:
+    ) -> List[SayouBlock]:
         """
         Convert a table element into a Markdown table representation.
@@ -232,7 +300,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
             md_table += "| " + " | ".join(body_cells) + " |\n"
         return [
-            ContentBlock(
+            SayouBlock(
                 type="md",
                 content=md_table.strip(),
                 metadata={
@@ -245,7 +313,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
     def _handle_image(
         self, element: Dict[str, Any], is_header: bool, is_footer: bool
-    ) -> List[ContentBlock]:
+    ) -> List[SayouBlock]:
         """
         Process an image element.
@@ -266,7 +334,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
         img_format = element.get("image_format", "png")
         return [
-            ContentBlock(
+            SayouBlock(
                 type="image_base64",
                 content=image_base64,
                 metadata={
@@ -281,7 +349,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
     def _handle_chart(
         self, element: Dict[str, Any], is_header: bool, is_footer: bool
-    ) -> List[ContentBlock]:
+    ) -> List[SayouBlock]:
         """
         Convert a chart element into its text representation.
@@ -295,7 +363,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
         content = f"--- Chart Data ---\n{text_rep}\n--------------------\n"
         return [
-            ContentBlock(
+            SayouBlock(
                 type="md",
                 content=content,
                 metadata={

sayou/refinery/normalizer/html_text_normalizer.py CHANGED Viewed

@@ -5,14 +5,17 @@ except ImportError:
 from typing import Any, List
+from sayou.core.registry import register_component
+from sayou.core.schemas import SayouBlock
 from ..core.exceptions import NormalizationError
-from ..core.schemas import ContentBlock
 from ..interfaces.base_normalizer import BaseNormalizer
+@register_component("normalizer")
 class HtmlTextNormalizer(BaseNormalizer):
     """
-    (Tier 2) Converts HTML string into a clean Text ContentBlock.
+    (Tier 2) Converts HTML string into a clean Text SayouBlock.
     Uses BeautifulSoup to strip tags, scripts, and styles, returning only
     the visible text content while preserving paragraph structure.
@@ -21,7 +24,20 @@ class HtmlTextNormalizer(BaseNormalizer):
     component_name = "HtmlTextNormalizer"
     SUPPORTED_TYPES = ["html"]
-    def _do_normalize(self, raw_data: Any) -> List[ContentBlock]:
+    @classmethod
+    def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
+        if strategy in ["html"]:
+            return 1.0
+        if isinstance(raw_data, str):
+            sample = raw_data[:1000].lower()
+            if "<html" in sample or "<!doctype html" in sample:
+                return 1.0
+            if "<body" in sample or "<div" in sample:
+                return 0.95
+        return 0.0
+    def _do_normalize(self, raw_data: Any) -> List[SayouBlock]:
         """
         Parse HTML and extract text.
@@ -29,7 +45,7 @@ class HtmlTextNormalizer(BaseNormalizer):
             raw_data (Any): The input HTML string.
         Returns:
-            List[ContentBlock]: A single block of type 'text'.
+            List[SayouBlock]: A single block of type 'text'.
         Raises:
             ImportError: If BeautifulSoup4 is not installed.
@@ -45,15 +61,25 @@ class HtmlTextNormalizer(BaseNormalizer):
         soup = BeautifulSoup(raw_data, "html.parser")
-        for tag in soup(["script", "style", "noscript", "iframe"]):
+        extracted_meta = {"strategy": "html_parsed"}
+        if soup.title and soup.title.string:
+            extracted_meta["title"] = soup.title.string.strip()
+            extracted_meta["subject"] = soup.title.string.strip()
+        for meta_tag in soup.find_all("meta"):
+            name = meta_tag.get("name") or meta_tag.get("property")
+            content = meta_tag.get("content")
+            if name and content:
+                extracted_meta[name] = content
+        for tag in soup(["script", "style", "noscript", "iframe", "head"]):
             tag.extract()
-        text = soup.get_text(separator="\n")
+        text_content = soup.get_text(separator="\n")
         import re
-        text = re.sub(r"\n{3,}", "\n\n", text).strip()
+        text_content = re.sub(r"\n{3,}", "\n\n", text_content).strip()
-        return [
-            ContentBlock(type="text", content=text, metadata={"source_type": "html"})
-        ]
+        return [SayouBlock(type="text", content=text_content, metadata=extracted_meta)]

sayou/refinery/normalizer/record_normalizer.py CHANGED Viewed

@@ -1,22 +1,39 @@
 from typing import Any, Dict, List
+from sayou.core.registry import register_component
+from sayou.core.schemas import SayouBlock
 from ..core.exceptions import NormalizationError
-from ..core.schemas import ContentBlock
 from ..interfaces.base_normalizer import BaseNormalizer
+@register_component("normalizer")
 class RecordNormalizer(BaseNormalizer):
     """
-    (Tier 2) Converts structured data (Dict/List) into 'record' ContentBlocks.
+    (Tier 2) Converts structured data (Dict/List) into 'record' SayouBlocks.
     Suitable for processing database rows, CSV records, or JSON API responses.
-    Each dictionary becomes a separate ContentBlock of type 'record'.
+    Each dictionary becomes a separate SayouBlock of type 'record'.
     """
     component_name = "RecordNormalizer"
     SUPPORTED_TYPES = ["json", "dict", "db_row", "record"]
-    def _do_normalize(self, raw_data: Any) -> List[ContentBlock]:
+    @classmethod
+    def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
+        if strategy in ["json", "record", "db", "dict"]:
+            return 1.0
+        if isinstance(raw_data, dict):
+            return 0.9
+        if isinstance(raw_data, list):
+            if len(raw_data) > 0 and isinstance(raw_data[0], dict):
+                return 0.9
+            return 0.1
+        return 0.0
+    def _do_normalize(self, raw_data: Any) -> List[SayouBlock]:
         """
         Convert dict or list of dicts into record blocks.
@@ -24,7 +41,7 @@ class RecordNormalizer(BaseNormalizer):
             raw_data (Any): A Dictionary or a List of Dictionaries.
         Returns:
-            List[ContentBlock]: Blocks of type 'record'.
+            List[SayouBlock]: Blocks of type 'record'.
         """
         blocks = []
@@ -49,17 +66,17 @@ class RecordNormalizer(BaseNormalizer):
         return blocks
-    def _create_block(self, data: Dict[str, Any]) -> ContentBlock:
+    def _create_block(self, data: Dict[str, Any]) -> SayouBlock:
         """
-        Helper to wrap a single dictionary into a ContentBlock.
+        Helper to wrap a single dictionary into a SayouBlock.
         Args:
             data (Dict[str, Any]): The data record.
         Returns:
-            ContentBlock: A block with type='record' and content=data.
+            SayouBlock: A block with type='record' and content=data.
         """
-        return ContentBlock(
+        return SayouBlock(
             type="record",
             content=data,
             metadata={"fields": list(data.keys())},

sayou-refinery 0.1.6__py3-none-any.whl → 0.3.3__py3-none-any.whl

sayou-refinery 0.1.6__py3-none-any.whl → 0.3.3__py3-none-any.whl