sayou-refinery 0.1.6__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ from .pipeline import RefineryPipeline
2
+ from .normalizer.doc_markdown_normalizer import DocMarkdownNormalizer
3
+ from .normalizer.html_text_normalizer import HtmlTextNormalizer
4
+ from .normalizer.record_normalizer import RecordNormalizer
5
+ from .processor.deduplicator import Deduplicator
6
+ from .processor.imputer import Imputer
7
+ from .processor.outlier_handler import OutlierHandler
8
+ from .processor.pii_masker import PiiMasker
9
+ from .processor.text_cleaner import TextCleaner
10
+
11
+ __all__ = [
12
+ "RefineryPipeline",
13
+ "DocMarkdownNormalizer",
14
+ "HtmlTextNormalizer",
15
+ "RecordNormalizer",
16
+ "Deduplicator",
17
+ "Imputer",
18
+ "OutlierHandler",
19
+ "PiiMasker",
20
+ "TextCleaner",
21
+ ]
@@ -11,7 +11,7 @@ class RefineryError(SayouCoreError):
11
11
 
12
12
  class NormalizationError(RefineryError):
13
13
  """
14
- Raised when raw data cannot be converted to ContentBlocks.
14
+ Raised when raw data cannot be converted to SayouBlocks.
15
15
  (e.g., Malformed JSON, Unsupported format)
16
16
  """
17
17
 
@@ -3,24 +3,38 @@ from typing import Any, List
3
3
 
4
4
  from sayou.core.base_component import BaseComponent
5
5
  from sayou.core.decorators import measure_time
6
+ from sayou.core.schemas import SayouBlock
6
7
 
7
8
  from ..core.exceptions import NormalizationError
8
- from ..core.schemas import ContentBlock
9
9
 
10
10
 
11
11
  class BaseNormalizer(BaseComponent):
12
12
  """
13
- (Tier 1) Abstract base class for converting raw input into ContentBlocks.
13
+ (Tier 1) Abstract base class for converting raw input into SayouBlock.
14
14
 
15
15
  Normalizers are responsible for structural transformation:
16
- Raw Data (JSON, HTML, DB Row) -> List[ContentBlock]
16
+ Raw Data (JSON, HTML, DB Row) -> List[SayouBlock]
17
17
  """
18
18
 
19
19
  component_name = "BaseNormalizer"
20
20
  SUPPORTED_TYPES = []
21
21
 
22
+ @classmethod
23
+ def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
24
+ """
25
+ Determines if this normalizer can handle the raw input data.
26
+
27
+ Args:
28
+ raw_data: The input data (dict, str, Document object, etc.)
29
+ strategy: Explicit type hint from user (e.g. 'html', 'json')
30
+
31
+ Returns:
32
+ float: Confidence score (0.0 to 1.0)
33
+ """
34
+ return 0.0
35
+
22
36
  @measure_time
23
- def normalize(self, raw_data: Any) -> List[ContentBlock]:
37
+ def normalize(self, raw_data: Any) -> List[SayouBlock]:
24
38
  """
25
39
  Execute the normalization process.
26
40
 
@@ -28,20 +42,27 @@ class BaseNormalizer(BaseComponent):
28
42
  raw_data: The raw input data from Connector or Document.
29
43
 
30
44
  Returns:
31
- List[ContentBlock]: A list of normalized content blocks.
45
+ List[SayouBlock]: A list of normalized content blocks.
32
46
 
33
47
  Raises:
34
48
  NormalizationError: If transformation fails.
35
49
  """
50
+ self._emit("on_start", input_data={"type": type(raw_data).__name__})
51
+
36
52
  self._log(f"Normalizing data (Type: {type(raw_data).__name__})")
53
+
37
54
  try:
38
55
  blocks = self._do_normalize(raw_data)
56
+
57
+ self._emit("on_finish", result_data={"blocks": len(blocks)}, success=True)
58
+
39
59
  if not isinstance(blocks, list):
40
60
  raise NormalizationError(f"Output must be a list, got {type(blocks)}")
41
61
 
42
62
  return blocks
43
63
 
44
64
  except Exception as e:
65
+ self._emit("on_error", error=e)
45
66
  wrapped_error = NormalizationError(
46
67
  f"[{self.component_name}] Failed: {str(e)}"
47
68
  )
@@ -49,14 +70,14 @@ class BaseNormalizer(BaseComponent):
49
70
  raise wrapped_error
50
71
 
51
72
  @abstractmethod
52
- def _do_normalize(self, raw_data: Any) -> List[ContentBlock]:
73
+ def _do_normalize(self, raw_data: Any) -> List[SayouBlock]:
53
74
  """
54
- [Abstract Hook] Implement logic to convert specific raw format to ContentBlocks.
75
+ [Abstract Hook] Implement logic to convert specific raw format to SayouBlocks.
55
76
 
56
77
  Args:
57
78
  raw_data: The raw input.
58
79
 
59
80
  Returns:
60
- List[ContentBlock]: The standardized blocks.
81
+ List[SayouBlock]: The standardized blocks.
61
82
  """
62
83
  raise NotImplementedError
@@ -3,14 +3,14 @@ from typing import List
3
3
 
4
4
  from sayou.core.base_component import BaseComponent
5
5
  from sayou.core.decorators import measure_time
6
+ from sayou.core.schemas import SayouBlock
6
7
 
7
8
  from ..core.exceptions import ProcessingError
8
- from ..core.schemas import ContentBlock
9
9
 
10
10
 
11
11
  class BaseProcessor(BaseComponent):
12
12
  """
13
- (Tier 1) Abstract base class for processing/cleaning ContentBlocks.
13
+ (Tier 1) Abstract base class for processing/cleaning SayouBlock.
14
14
 
15
15
  Processors operate on data that is already normalized. They can modify content
16
16
  (e.g., PII masking, Imputation) or filter out blocks (e.g., Deduplication).
@@ -18,40 +18,60 @@ class BaseProcessor(BaseComponent):
18
18
 
19
19
  component_name = "BaseProcessor"
20
20
 
21
+ @classmethod
22
+ def can_handle(cls, blocks: List[SayouBlock]) -> float:
23
+ """
24
+ Processors are usually explicitly chained, but this allows for
25
+ future smart-selection (e.g., auto-detecting PII).
26
+ """
27
+ if (
28
+ isinstance(blocks, list)
29
+ and len(blocks) > 0
30
+ and isinstance(blocks[0], SayouBlock)
31
+ ):
32
+ return 0.5
33
+ return 0.0
34
+
21
35
  @measure_time
22
- def process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
36
+ def process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
23
37
  """
24
38
  Execute the processing logic on a list of blocks.
25
39
 
26
40
  Args:
27
- blocks: Input list of ContentBlocks.
41
+ blocks: Input list of SayouBlocks.
28
42
 
29
43
  Returns:
30
- List[ContentBlock]: Processed list of ContentBlocks.
44
+ List[SayouBlock]: Processed list of SayouBlocks.
31
45
 
32
46
  Raises:
33
47
  ProcessingError: If processing logic fails.
34
48
  """
49
+ self._emit("on_start", input_data={"blocks": len(blocks)})
35
50
  try:
36
51
  if not blocks:
37
52
  return []
38
53
 
39
- return self._do_process(blocks)
54
+ result = self._do_process(blocks)
55
+
56
+ self._emit("on_finish", result_data={"blocks": len(result)}, success=True)
57
+
58
+ return result
40
59
 
41
60
  except Exception as e:
61
+ self._emit("on_error", error=e)
42
62
  wrapped_error = ProcessingError(f"[{self.component_name}] Failed: {str(e)}")
43
63
  self.logger.error(wrapped_error, exc_info=True)
44
64
  raise wrapped_error
45
65
 
46
66
  @abstractmethod
47
- def _do_process(self, blocks: List[ContentBlock]) -> List[ContentBlock]:
67
+ def _do_process(self, blocks: List[SayouBlock]) -> List[SayouBlock]:
48
68
  """
49
69
  [Abstract Hook] Implement cleaning/filtering logic.
50
70
 
51
71
  Args:
52
- blocks: List of input ContentBlocks.
72
+ blocks: List of input SayouBlocks.
53
73
 
54
74
  Returns:
55
- List[ContentBlock]: Modified list of ContentBlocks.
75
+ List[SayouBlock]: Modified list of SayouBlocks.
56
76
  """
57
77
  raise NotImplementedError
@@ -1,13 +1,16 @@
1
1
  from typing import Any, Dict, List
2
2
 
3
+ from sayou.core.registry import register_component
4
+ from sayou.core.schemas import SayouBlock
5
+
3
6
  from ..core.exceptions import NormalizationError
4
- from ..core.schemas import ContentBlock
5
7
  from ..interfaces.base_normalizer import BaseNormalizer
6
8
 
7
9
 
10
+ @register_component("normalizer")
8
11
  class DocMarkdownNormalizer(BaseNormalizer):
9
12
  """
10
- (Tier 2) Normalizes a Sayou Document Dictionary into Markdown ContentBlocks.
13
+ (Tier 2) Normalizes a Sayou Document Dictionary into Markdown SayouBlocks.
11
14
 
12
15
  This engine parses the structured dictionary output from 'sayou-document' and
13
16
  converts individual elements (Text, Table, Image, Chart) into semantically
@@ -17,6 +20,24 @@ class DocMarkdownNormalizer(BaseNormalizer):
17
20
  component_name = "DocMarkdownNormalizer"
18
21
  SUPPORTED_TYPES = ["standard_doc", "sayou_doc_json"]
19
22
 
23
+ @classmethod
24
+ def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
25
+ if strategy in ["markdown", "standard_doc"]:
26
+ return 1.0
27
+
28
+ if hasattr(raw_data, "doc_type") and hasattr(raw_data, "pages"):
29
+ return 1.0
30
+
31
+ if isinstance(raw_data, str):
32
+ if any(
33
+ line.strip().startswith(("#", "-", "* "))
34
+ for line in raw_data.splitlines()[:10]
35
+ ):
36
+ return 0.8
37
+ return 0.1
38
+
39
+ return 0.0
40
+
20
41
  def initialize(
21
42
  self,
22
43
  include_headers: bool = True,
@@ -35,7 +56,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
35
56
  self.include_headers = include_headers
36
57
  self.include_footers = include_footers
37
58
 
38
- def _do_normalize(self, raw_data: Any) -> List[ContentBlock]:
59
+ def _do_normalize(self, raw_data: Any) -> List[SayouBlock]:
39
60
  """
40
61
  Execute the normalization logic on the document dictionary.
41
62
 
@@ -43,47 +64,94 @@ class DocMarkdownNormalizer(BaseNormalizer):
43
64
  raw_data (Any): The input dictionary adhering to Sayou Document Schema.
44
65
 
45
66
  Returns:
46
- List[ContentBlock]: A list of normalized content blocks (mostly 'md' type).
67
+ List[SayouBlock]: A list of normalized content blocks (mostly 'md' type).
47
68
 
48
69
  Raises:
49
70
  NormalizationError: If `raw_data` is not a valid dictionary.
50
71
  """
51
- if not isinstance(raw_data, dict):
72
+ # 1. Input Handling (Dict/Object/Str Safe Conversion)
73
+ if isinstance(raw_data, str):
74
+ return [SayouBlock(type="md", content=raw_data, metadata={})]
75
+
76
+ # Handle Pydantic models or objects safely
77
+ if hasattr(raw_data, "model_dump"):
78
+ doc_data = raw_data.model_dump()
79
+ elif hasattr(raw_data, "dict"):
80
+ doc_data = raw_data.dict()
81
+ elif hasattr(raw_data, "__dict__"):
82
+ doc_data = raw_data.__dict__
83
+ elif isinstance(raw_data, dict):
84
+ doc_data = raw_data
85
+ else:
52
86
  raise NormalizationError(
53
- f"Input must be a Dictionary, got {type(raw_data).__name__}"
87
+ f"Input must be convertible to Dictionary, got {type(raw_data).__name__}"
54
88
  )
55
89
 
56
- doc_data = raw_data
57
- blocks: List[ContentBlock] = []
90
+ normalized_blocks: List[SayouBlock] = []
58
91
 
59
- if "metadata" in doc_data and doc_data["metadata"]:
60
- blocks.extend(self._handle_doc_metadata(doc_data))
92
+ doc_meta = doc_data.get("metadata", {})
61
93
 
62
- for page in doc_data.get("pages", []):
63
- if self.include_headers and "header_elements" in page:
64
- for element in page.get("header_elements", []):
65
- blocks.extend(
66
- self._handle_element(element, is_header=True, is_footer=False)
67
- )
94
+ def sanitize_text(text: str) -> str:
95
+ if not text:
96
+ return ""
97
+ text = text.replace("\x0b", "\n")
98
+ text = text.replace("\r", "\n")
99
+ text = text.replace("\f", "\n")
100
+ return text
68
101
 
69
- for element in page.get("elements", []):
70
- blocks.extend(
71
- self._handle_element(element, is_header=False, is_footer=False)
102
+ # 2. Iterate Pages
103
+ for page in doc_data.get("pages", []):
104
+ page_content_buffer = []
105
+ page_num = page.get("page_index", 0)
106
+
107
+ # Helper to extract text from elements using existing logic
108
+ def collect_text(elements, is_header=False, is_footer=False):
109
+ if not elements:
110
+ return
111
+ for element in elements:
112
+ sub_blocks = self._handle_element(element, is_header, is_footer)
113
+ for sb in sub_blocks:
114
+ if sb.content and sb.content.strip():
115
+ clean_content = sanitize_text(sb.content.strip())
116
+ page_content_buffer.append(clean_content)
117
+
118
+ # A. Header Elements
119
+ if self.include_headers:
120
+ collect_text(page.get("header_elements", []), is_header=True)
121
+
122
+ # B. Body Elements (Main Content)
123
+ collect_text(page.get("elements", []), is_header=False)
124
+
125
+ # C. Footer Elements
126
+ if self.include_footers:
127
+ collect_text(page.get("footer_elements", []), is_footer=True)
128
+
129
+ # 3. Aggregate: Create ONE Block per Page
130
+ if page_content_buffer:
131
+ full_page_text = "\n\n".join(page_content_buffer)
132
+
133
+ block_meta = doc_meta.copy()
134
+ block_meta.update(
135
+ {
136
+ "page_num": page_num,
137
+ "origin_type": "page_aggregated",
138
+ "source": doc_meta.get("filename", "unknown"),
139
+ }
72
140
  )
73
141
 
74
- if self.include_footers and "footer_elements" in page:
75
- for element in page.get("footer_elements", []):
76
- # T2의 기본 규칙: include_footers가 True여도 _handle_element에서
77
- # is_footer=True 플래그를 보고 무시할 수 있음 (T3가 오버라이드 가능)
78
- blocks.extend(
79
- self._handle_element(element, is_header=False, is_footer=True)
142
+ normalized_blocks.append(
143
+ SayouBlock(
144
+ type="md",
145
+ content=full_page_text,
146
+ metadata=block_meta,
80
147
  )
148
+ )
81
149
 
82
- return blocks
150
+ return normalized_blocks
83
151
 
84
152
  def _handle_element(
85
153
  self, element: Dict[str, Any], is_header: bool, is_footer: bool
86
- ) -> List[ContentBlock]:
154
+ ) -> List[SayouBlock]:
87
155
  """
88
156
  Dispatch the element to specific handlers based on its 'type' field.
89
157
 
@@ -93,7 +161,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
93
161
  is_footer (bool): True if the element is part of the page footer.
94
162
 
95
163
  Returns:
96
- List[ContentBlock]: The resulting block(s) from the element.
164
+ List[SayouBlock]: The resulting block(s) from the element.
97
165
  """
98
166
  if is_footer and not self.include_footers:
99
167
  return []
@@ -114,7 +182,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
114
182
 
115
183
  return []
116
184
 
117
- def _handle_doc_metadata(self, doc_data: Dict[str, Any]) -> List[ContentBlock]:
185
+ def _handle_doc_metadata(self, doc_data: Dict[str, Any]) -> List[SayouBlock]:
118
186
  """
119
187
  Convert document-level metadata into a Markdown Frontmatter block.
120
188
 
@@ -122,7 +190,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
122
190
  doc_data (Dict[str, Any]): The root document dictionary containing 'metadata'.
123
191
 
124
192
  Returns:
125
- List[ContentBlock]: A single block containing YAML-like frontmatter.
193
+ List[SayouBlock]: A single block containing YAML-like frontmatter.
126
194
  """
127
195
  md_frontmatter = "---\n"
128
196
  metadata = doc_data.get("metadata", {})
@@ -137,7 +205,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
137
205
  md_frontmatter += "---\n\n"
138
206
 
139
207
  return [
140
- ContentBlock(
208
+ SayouBlock(
141
209
  type="md",
142
210
  content=md_frontmatter,
143
211
  metadata={"page_num": 0, "id": "metadata", "is_footer": False},
@@ -146,7 +214,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
146
214
 
147
215
  def _handle_text(
148
216
  self, element: Dict[str, Any], is_header: bool, is_footer: bool
149
- ) -> List[ContentBlock]:
217
+ ) -> List[SayouBlock]:
150
218
  """
151
219
  Convert a text element to a Markdown block, handling headings and lists.
152
220
 
@@ -183,7 +251,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
183
251
  content = text
184
252
 
185
253
  return [
186
- ContentBlock(
254
+ SayouBlock(
187
255
  type="md",
188
256
  content=content,
189
257
  metadata={
@@ -197,7 +265,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
197
265
 
198
266
  def _handle_table(
199
267
  self, element: Dict[str, Any], is_header: bool, is_footer: bool
200
- ) -> List[ContentBlock]:
268
+ ) -> List[SayouBlock]:
201
269
  """
202
270
  Convert a table element into a Markdown table representation.
203
271
 
@@ -232,7 +300,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
232
300
  md_table += "| " + " | ".join(body_cells) + " |\n"
233
301
 
234
302
  return [
235
- ContentBlock(
303
+ SayouBlock(
236
304
  type="md",
237
305
  content=md_table.strip(),
238
306
  metadata={
@@ -245,7 +313,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
245
313
 
246
314
  def _handle_image(
247
315
  self, element: Dict[str, Any], is_header: bool, is_footer: bool
248
- ) -> List[ContentBlock]:
316
+ ) -> List[SayouBlock]:
249
317
  """
250
318
  Process an image element.
251
319
 
@@ -266,7 +334,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
266
334
  img_format = element.get("image_format", "png")
267
335
 
268
336
  return [
269
- ContentBlock(
337
+ SayouBlock(
270
338
  type="image_base64",
271
339
  content=image_base64,
272
340
  metadata={
@@ -281,7 +349,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
281
349
 
282
350
  def _handle_chart(
283
351
  self, element: Dict[str, Any], is_header: bool, is_footer: bool
284
- ) -> List[ContentBlock]:
352
+ ) -> List[SayouBlock]:
285
353
  """
286
354
  Convert a chart element into its text representation.
287
355
 
@@ -295,7 +363,7 @@ class DocMarkdownNormalizer(BaseNormalizer):
295
363
  content = f"--- Chart Data ---\n{text_rep}\n--------------------\n"
296
364
 
297
365
  return [
298
- ContentBlock(
366
+ SayouBlock(
299
367
  type="md",
300
368
  content=content,
301
369
  metadata={
@@ -5,14 +5,17 @@ except ImportError:
5
5
 
6
6
  from typing import Any, List
7
7
 
8
+ from sayou.core.registry import register_component
9
+ from sayou.core.schemas import SayouBlock
10
+
8
11
  from ..core.exceptions import NormalizationError
9
- from ..core.schemas import ContentBlock
10
12
  from ..interfaces.base_normalizer import BaseNormalizer
11
13
 
12
14
 
15
+ @register_component("normalizer")
13
16
  class HtmlTextNormalizer(BaseNormalizer):
14
17
  """
15
- (Tier 2) Converts HTML string into a clean Text ContentBlock.
18
+ (Tier 2) Converts HTML string into a clean Text SayouBlock.
16
19
 
17
20
  Uses BeautifulSoup to strip tags, scripts, and styles, returning only
18
21
  the visible text content while preserving paragraph structure.
@@ -21,7 +24,20 @@ class HtmlTextNormalizer(BaseNormalizer):
21
24
  component_name = "HtmlTextNormalizer"
22
25
  SUPPORTED_TYPES = ["html"]
23
26
 
24
- def _do_normalize(self, raw_data: Any) -> List[ContentBlock]:
27
+ @classmethod
28
+ def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
29
+ if strategy in ["html"]:
30
+ return 1.0
31
+
32
+ if isinstance(raw_data, str):
33
+ sample = raw_data[:1000].lower()
34
+ if "<html" in sample or "<!doctype html" in sample:
35
+ return 1.0
36
+ if "<body" in sample or "<div" in sample:
37
+ return 0.95
38
+ return 0.0
39
+
40
+ def _do_normalize(self, raw_data: Any) -> List[SayouBlock]:
25
41
  """
26
42
  Parse HTML and extract text.
27
43
 
@@ -29,7 +45,7 @@ class HtmlTextNormalizer(BaseNormalizer):
29
45
  raw_data (Any): The input HTML string.
30
46
 
31
47
  Returns:
32
- List[ContentBlock]: A single block of type 'text'.
48
+ List[SayouBlock]: A single block of type 'text'.
33
49
 
34
50
  Raises:
35
51
  ImportError: If BeautifulSoup4 is not installed.
@@ -45,15 +61,25 @@ class HtmlTextNormalizer(BaseNormalizer):
45
61
 
46
62
  soup = BeautifulSoup(raw_data, "html.parser")
47
63
 
48
- for tag in soup(["script", "style", "noscript", "iframe"]):
64
+ extracted_meta = {"strategy": "html_parsed"}
65
+
66
+ if soup.title and soup.title.string:
67
+ extracted_meta["title"] = soup.title.string.strip()
68
+ extracted_meta["subject"] = soup.title.string.strip()
69
+
70
+ for meta_tag in soup.find_all("meta"):
71
+ name = meta_tag.get("name") or meta_tag.get("property")
72
+ content = meta_tag.get("content")
73
+ if name and content:
74
+ extracted_meta[name] = content
75
+
76
+ for tag in soup(["script", "style", "noscript", "iframe", "head"]):
49
77
  tag.extract()
50
78
 
51
- text = soup.get_text(separator="\n")
79
+ text_content = soup.get_text(separator="\n")
52
80
 
53
81
  import re
54
82
 
55
- text = re.sub(r"\n{3,}", "\n\n", text).strip()
83
+ text_content = re.sub(r"\n{3,}", "\n\n", text_content).strip()
56
84
 
57
- return [
58
- ContentBlock(type="text", content=text, metadata={"source_type": "html"})
59
- ]
85
+ return [SayouBlock(type="text", content=text_content, metadata=extracted_meta)]
@@ -1,22 +1,39 @@
1
1
  from typing import Any, Dict, List
2
2
 
3
+ from sayou.core.registry import register_component
4
+ from sayou.core.schemas import SayouBlock
5
+
3
6
  from ..core.exceptions import NormalizationError
4
- from ..core.schemas import ContentBlock
5
7
  from ..interfaces.base_normalizer import BaseNormalizer
6
8
 
7
9
 
10
+ @register_component("normalizer")
8
11
  class RecordNormalizer(BaseNormalizer):
9
12
  """
10
- (Tier 2) Converts structured data (Dict/List) into 'record' ContentBlocks.
13
+ (Tier 2) Converts structured data (Dict/List) into 'record' SayouBlocks.
11
14
 
12
15
  Suitable for processing database rows, CSV records, or JSON API responses.
13
- Each dictionary becomes a separate ContentBlock of type 'record'.
16
+ Each dictionary becomes a separate SayouBlock of type 'record'.
14
17
  """
15
18
 
16
19
  component_name = "RecordNormalizer"
17
20
  SUPPORTED_TYPES = ["json", "dict", "db_row", "record"]
18
21
 
19
- def _do_normalize(self, raw_data: Any) -> List[ContentBlock]:
22
+ @classmethod
23
+ def can_handle(cls, raw_data: Any, strategy: str = "auto") -> float:
24
+ if strategy in ["json", "record", "db", "dict"]:
25
+ return 1.0
26
+
27
+ if isinstance(raw_data, dict):
28
+ return 0.9
29
+ if isinstance(raw_data, list):
30
+ if len(raw_data) > 0 and isinstance(raw_data[0], dict):
31
+ return 0.9
32
+ return 0.1
33
+
34
+ return 0.0
35
+
36
+ def _do_normalize(self, raw_data: Any) -> List[SayouBlock]:
20
37
  """
21
38
  Convert dict or list of dicts into record blocks.
22
39
 
@@ -24,7 +41,7 @@ class RecordNormalizer(BaseNormalizer):
24
41
  raw_data (Any): A Dictionary or a List of Dictionaries.
25
42
 
26
43
  Returns:
27
- List[ContentBlock]: Blocks of type 'record'.
44
+ List[SayouBlock]: Blocks of type 'record'.
28
45
  """
29
46
  blocks = []
30
47
 
@@ -49,17 +66,17 @@ class RecordNormalizer(BaseNormalizer):
49
66
 
50
67
  return blocks
51
68
 
52
- def _create_block(self, data: Dict[str, Any]) -> ContentBlock:
69
+ def _create_block(self, data: Dict[str, Any]) -> SayouBlock:
53
70
  """
54
- Helper to wrap a single dictionary into a ContentBlock.
71
+ Helper to wrap a single dictionary into a SayouBlock.
55
72
 
56
73
  Args:
57
74
  data (Dict[str, Any]): The data record.
58
75
 
59
76
  Returns:
60
- ContentBlock: A block with type='record' and content=data.
77
+ SayouBlock: A block with type='record' and content=data.
61
78
  """
62
- return ContentBlock(
79
+ return SayouBlock(
63
80
  type="record",
64
81
  content=data,
65
82
  metadata={"fields": list(data.keys())},