sayou-refinery 0.1.6__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sayou/refinery/__init__.py +21 -0
- sayou/refinery/core/exceptions.py +1 -1
- sayou/refinery/interfaces/base_normalizer.py +29 -8
- sayou/refinery/interfaces/base_processor.py +29 -9
- sayou/refinery/normalizer/doc_markdown_normalizer.py +107 -39
- sayou/refinery/normalizer/html_text_normalizer.py +36 -10
- sayou/refinery/normalizer/record_normalizer.py +26 -9
- sayou/refinery/pipeline.py +251 -63
- sayou/refinery/processor/deduplicator.py +14 -5
- sayou/refinery/processor/imputer.py +13 -4
- sayou/refinery/processor/outlier_handler.py +11 -4
- sayou/refinery/processor/pii_masker.py +11 -4
- sayou/refinery/processor/text_cleaner.py +13 -4
- {sayou_refinery-0.1.6.dist-info → sayou_refinery-0.3.3.dist-info}/METADATA +6 -6
- sayou_refinery-0.3.3.dist-info/RECORD +16 -0
- sayou/refinery/core/schemas.py +0 -27
- sayou_refinery-0.1.6.dist-info/RECORD +0 -16
- {sayou_refinery-0.1.6.dist-info → sayou_refinery-0.3.3.dist-info}/WHEEL +0 -0
sayou/refinery/core/schemas.py
DELETED
@@ -1,27 +0,0 @@
-from typing import Any, Dict, List, Union
-
-from pydantic import BaseModel, Field
-
-
-class ContentBlock(BaseModel):
-    """
-    Standard unit of content refined from raw data.
-
-    Refinery normalizes raw inputs into a list of these blocks.
-    Processors iterate over these blocks to clean or modify them.
-    """
-
-    type: str = Field(
-        ..., description="Block type (e.g., 'text', 'md', 'record', 'table')"
-    )
-
-    content: Union[str, Dict[str, Any], List[Any]] = Field(
-        ..., description="The actual data payload"
-    )
-
-    metadata: Dict[str, Any] = Field(
-        default_factory=dict, description="Context info (page_num, source_id, etc.)"
-    )
-
-    class Config:
-        arbitrary_types_allowed = True
sayou_refinery-0.1.6.dist-info/RECORD
DELETED
@@ -1,16 +0,0 @@
-sayou/refinery/pipeline.py,sha256=oJbygy300ounS3xL3UdCpwnmdmUTRib-W-ADsPJ1Vjs,3756
-sayou/refinery/core/exceptions.py,sha256=LhY8tDk9UzqjGjy-7UPpzBSRpH4vUl3ZemmW-BssdJY,547
-sayou/refinery/core/schemas.py,sha256=LhKV5X8WIiUV273OpX9y7TduQUX03qZh-rgRSMn_eCs,727
-sayou/refinery/interfaces/base_normalizer.py,sha256=nYQ40IM83WXnIiSIDqhWHfiHAC0O4F9iymb7-u7cSIE,1862
-sayou/refinery/interfaces/base_processor.py,sha256=A9YvD1ZwHhlK290Y77xkSeJTtBCJ53wAYI-KeuVgShM,1650
-sayou/refinery/normalizer/doc_markdown_normalizer.py,sha256=ZVkTEjCesrbjWRRetD1Lp6_YjHsdDL9GNXO4yvUYIUw,10277
-sayou/refinery/normalizer/html_text_normalizer.py,sha256=hX0UTbJwND0Rv-_HuGL-p4Popdrg6_m_mDkKZDYW5AE,1675
-sayou/refinery/normalizer/record_normalizer.py,sha256=bzErNEVw9g-QtQiq_wdm_AxiZi_uuvtx23AL8DRjgxQ,2029
-sayou/refinery/processor/deduplicator.py,sha256=yKZkaPyY4P_a-IwIK8f3XCYqgnoF0us0NbMsBDY03ic,1402
-sayou/refinery/processor/imputer.py,sha256=vvaGvxQNajKSjbYr-gNHmd4HYsD4FtchJhGfnKv-cpo,1562
-sayou/refinery/processor/outlier_handler.py,sha256=bWcECxwVVvfcjiy5lm3YYABUA_7lIW_Do5Q70rK-mtM,2693
-sayou/refinery/processor/pii_masker.py,sha256=fbXPE2HLESQFUvNITnLp-9q2L4MEPcfIkLUUGRIva8k,1726
-sayou/refinery/processor/text_cleaner.py,sha256=8_Hu6H_W__tNfwebm9cS43DRc8EExkhfpkmxslShDjU,1748
-sayou_refinery-0.1.6.dist-info/METADATA,sha256=zmor5IfNcoOzqqmX2OVwudo42b-43etk20LFf5r5wkg,16989
-sayou_refinery-0.1.6.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-sayou_refinery-0.1.6.dist-info/RECORD,,
{sayou_refinery-0.1.6.dist-info → sayou_refinery-0.3.3.dist-info}/WHEEL
File without changes
|