data-prep-toolkit-transforms 0.2.1__tar.gz → 0.2.1.dev0__tar.gz
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- data_prep_toolkit_transforms-0.2.1.dev0/PKG-INFO +33 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/pyproject.toml +28 -4
- data_prep_toolkit_transforms-0.2.1.dev0/src/__init__.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/code2parquet_transform.py +8 -4
- data_prep_toolkit_transforms-0.2.1.dev0/src/data_prep_toolkit_transforms.egg-info/PKG-INFO +33 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/data_prep_toolkit_transforms.egg-info/SOURCES.txt +1 -15
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/data_prep_toolkit_transforms.egg-info/requires.txt +11 -11
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/data_prep_toolkit_transforms.egg-info/top_level.txt +1 -12
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_Gopher_statistics.py +1 -3
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_chunk_chunkers.py +6 -17
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_chunk_transform.py +1 -32
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_quality_local_python.py +3 -4
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_quality_transform.py +12 -20
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/ededup_local.py +5 -8
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/ededup_local_python.py +4 -6
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/ededup_transform_base.py +16 -111
- data_prep_toolkit_transforms-0.2.1.dev0/src/ededup_transform_python.py +69 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/filter_local_python.py +2 -2
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/filter_transform.py +3 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/header_cleanser_local.py +5 -5
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/header_cleanser_local_python.py +2 -2
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/header_cleanser_test_support.py +4 -4
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/header_cleanser_transform.py +63 -66
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/header_cleanser_transform_python.py +1 -1
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/lang_id_transform.py +4 -2
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/nlp.py +4 -10
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/pdf2parquet_local.py +3 -7
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/pdf2parquet_local_python.py +5 -11
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/pdf2parquet_transform.py +29 -62
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/proglang_select_local_python.py +2 -2
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/proglang_select_transform.py +3 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/resize_transform.py +2 -8
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/text_encoder_local.py +3 -1
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/text_encoder_local_python.py +6 -2
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/tokenization_local_python.py +2 -2
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/tokenization_transform.py +3 -0
- data_prep_toolkit_transforms-0.2.1/Makefile +0 -62
- data_prep_toolkit_transforms-0.2.1/PKG-INFO +0 -73
- data_prep_toolkit_transforms-0.2.1/README.md +0 -39
- data_prep_toolkit_transforms-0.2.1/requirements.txt +0 -31
- data_prep_toolkit_transforms-0.2.1/src/data_prep_toolkit_transforms.egg-info/PKG-INFO +0 -73
- data_prep_toolkit_transforms-0.2.1/src/doc_id_local.py +0 -54
- data_prep_toolkit_transforms-0.2.1/src/doc_id_local_python.py +0 -52
- data_prep_toolkit_transforms-0.2.1/src/doc_id_transform_base.py +0 -177
- data_prep_toolkit_transforms-0.2.1/src/doc_id_transform_python.py +0 -120
- data_prep_toolkit_transforms-0.2.1/src/ededup_local_python_incremental.py +0 -53
- data_prep_toolkit_transforms-0.2.1/src/ededup_transform_python.py +0 -145
- data_prep_toolkit_transforms-0.2.1/src/flair_recognizer.py +0 -149
- data_prep_toolkit_transforms-0.2.1/src/pii_analyzer.py +0 -71
- data_prep_toolkit_transforms-0.2.1/src/pii_anonymizer.py +0 -27
- data_prep_toolkit_transforms-0.2.1/src/pii_redactor_local.py +0 -37
- data_prep_toolkit_transforms-0.2.1/src/pii_redactor_local_python.py +0 -37
- data_prep_toolkit_transforms-0.2.1/src/pii_redactor_transform.py +0 -152
- data_prep_toolkit_transforms-0.2.1/src/pii_redactor_transform_python.py +0 -35
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/setup.cfg +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/cc_net_prepro.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/code2parquet_local.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/code2parquet_local_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/code2parquet_s3_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/code2parquet_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/code_quality_local.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/code_quality_local_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/code_quality_transform.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/code_quality_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/data_prep_toolkit_transforms.egg-info/dependency_links.txt +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_c4_statistics.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_chunk_local.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_chunk_local_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_chunk_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_quality_local.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_quality_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_quality_utils.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/filter_local.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/filter_test_support.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/filter_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/lang_id_local.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/lang_id_local_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/lang_id_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/lang_models.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/pdf2parquet_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/proglang_select_local.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/proglang_select_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/resize_local.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/resize_local_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/resize_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/text_encoder_transform.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/text_encoder_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/tokenization_local_long_doc_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/tokenization_s3_long_doc_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/tokenization_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/tokenization_utils.py +0 -0
data_prep_toolkit_transforms-0.2.1.dev0/PKG-INFO
ADDED
@@ -0,0 +1,33 @@
+Metadata-Version: 2.1
+Name: data_prep_toolkit_transforms
+Version: 0.2.1.dev0
+Summary: Data Preparation Toolkit Transforms
+Author-email: Maroun Touma <touma@us.ibm.com>
+License: Apache-2.0
+Keywords: transforms,data preprocessing,data preparation,llm,generative,ai,fine-tuning,llmapps
+Requires-Python: <3.12,>=3.10
+Description-Content-Type: text/markdown
+Requires-Dist: data-prep-toolkit==0.2.1.dev0
+Requires-Dist: argparse
+Requires-Dist: boto3==1.34.69
+Requires-Dist: bs4==0.0.2
+Requires-Dist: clamd==1.0.2
+Requires-Dist: docling[ocr]==1.1.2
+Requires-Dist: duckdb==0.10.1
+Requires-Dist: fasttext==0.9.2
+Requires-Dist: filetype<2.0.0,>=1.2.0
+Requires-Dist: huggingface-hub<1.0.0,>=0.21.4
+Requires-Dist: langcodes==3.3.0
+Requires-Dist: mmh3==4.1.0
+Requires-Dist: numpy==1.26.4
+Requires-Dist: pandas
+Requires-Dist: parameterized
+Requires-Dist: pyarrow==16.1.0
+Requires-Dist: python-dateutil>=2.8.2
+Requires-Dist: pytz>=2020.1
+Requires-Dist: quackling==0.1.0
+Requires-Dist: scancode-toolkit==32.1.0; platform_system != "Darwin"
+Requires-Dist: sentence-transformers==3.0.1
+Requires-Dist: transformers==4.38.2
+Requires-Dist: tzdata>=2022.7
+Requires-Dist: xxhash==3.4.1
{data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/pyproject.toml
RENAMED
@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_toolkit_transforms"
-version = "0.2.1"
+version = "0.2.1.dev0"
 requires-python = ">=3.10,<3.12"
 keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
 description = "Data Preparation Toolkit Transforms"
@@ -9,14 +9,38 @@ readme = {file = "README.md", content-type = "text/markdown"}
 authors = [
     { name = "Maroun Touma", email = "touma@us.ibm.com" },
 ]
-
+
+dependencies = [
+    "data-prep-toolkit==0.2.1.dev0",
+    "argparse",
+    "boto3==1.34.69",
+    "bs4==0.0.2",
+    "clamd==1.0.2",
+    "docling[ocr]==1.1.2",
+    "duckdb==0.10.1",
+    "fasttext==0.9.2",
+    "filetype >=1.2.0, <2.0.0",
+    "huggingface-hub >= 0.21.4, <1.0.0",
+    "langcodes==3.3.0",
+    "mmh3==4.1.0",
+    "numpy==1.26.4",
+    "pandas",
+    "parameterized",
+    "pyarrow==16.1.0",
+    "python-dateutil>=2.8.2",
+    "pytz>=2020.1",
+    "quackling==0.1.0",
+    "scancode-toolkit==32.1.0 ; platform_system != 'Darwin'",
+    "sentence-transformers==3.0.1",
+    "transformers==4.38.2",
+    "tzdata>=2022.7",
+    "xxhash==3.4.1",
+]

 [build-system]
 requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
 build-backend = "setuptools.build_meta"

-[tool.setuptools.dynamic]
-dependencies = {file = ["requirements.txt"]}

 [options]
 package_dir = ["src"]
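The net effect of the pyproject.toml change: the dependency list moves from a dynamic `[tool.setuptools.dynamic]` lookup of `requirements.txt` into a static `dependencies` array under `[project]`, with most pins now exact. As a quick sanity check, a minimal sketch (standard library only; assumes the wheel is installed) that reads those pins back from installed metadata:

```python
# Minimal sketch: print the Requires-Dist pins of the installed package.
from importlib.metadata import requires

def show_pins(dist: str = "data_prep_toolkit_transforms") -> None:
    # requires() returns the raw Requires-Dist strings, e.g. "pyarrow==16.1.0",
    # including environment markers such as 'platform_system != "Darwin"'.
    for req in requires(dist) or []:
        print(req)

if __name__ == "__main__":
    show_pins()
```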
data_prep_toolkit_transforms-0.2.1.dev0/src/__init__.py
ADDED
File without changes
{data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/code2parquet_transform.py
RENAMED
@@ -13,16 +13,20 @@
 import io
 import json
 import logging
-import os
 import uuid
 import zipfile
 from argparse import ArgumentParser, Namespace
 from datetime import datetime
 from typing import Any
+import os

 import pyarrow as pa
 from data_processing.data_access import DataAccess, DataAccessFactory
-from data_processing.transform import AbstractBinaryTransform, TransformConfiguration
+from data_processing.transform import (
+    AbstractBinaryTransform,
+    AbstractTransform,
+    TransformConfiguration,
+)
 from data_processing.utils import CLIArgumentProvider, TransformUtils, str2bool
@@ -132,7 +136,7 @@ class CodeToParquetTransform(AbstractBinaryTransform):
                 "hash": TransformUtils.str_to_hash(content_string),
                 "size": len(content_string),
                 "date_acquired": datetime.now().isoformat(),
-                "repo_name":
+                "repo_name": os.path.splitext(os.path.basename(file_name))[0]
             } | self.shared_columns
             if self.detect_programming_lang:
                 lang = self._get_lang_from_ext(ext)
@@ -155,7 +159,7 @@ class CodeToParquetTransformConfiguration(TransformConfiguration):
     configuration with CLI args and combining of metadata.
     """

-    def __init__(self, transform_class: type[AbstractBinaryTransform] = CodeToParquetTransform):
+    def __init__(self, transform_class: type[AbstractTransform] = CodeToParquetTransform):
         super().__init__(
             name=shortname,
             transform_class=transform_class,
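The filled-in `repo_name` field is simply the input file's base name with the extension stripped. A tiny illustrative check of that expression (the path below is hypothetical):

```python
import os

# Same expression as the "repo_name" entry in the diff above.
file_name = "downloads/my-repo.zip"  # hypothetical input path
repo_name = os.path.splitext(os.path.basename(file_name))[0]
assert repo_name == "my-repo"
```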
data_prep_toolkit_transforms-0.2.1.dev0/src/data_prep_toolkit_transforms.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,33 @@
+Metadata-Version: 2.1
+Name: data_prep_toolkit_transforms
+Version: 0.2.1.dev0
+Summary: Data Preparation Toolkit Transforms
+Author-email: Maroun Touma <touma@us.ibm.com>
+License: Apache-2.0
+Keywords: transforms,data preprocessing,data preparation,llm,generative,ai,fine-tuning,llmapps
+Requires-Python: <3.12,>=3.10
+Description-Content-Type: text/markdown
+Requires-Dist: data-prep-toolkit==0.2.1.dev0
+Requires-Dist: argparse
+Requires-Dist: boto3==1.34.69
+Requires-Dist: bs4==0.0.2
+Requires-Dist: clamd==1.0.2
+Requires-Dist: docling[ocr]==1.1.2
+Requires-Dist: duckdb==0.10.1
+Requires-Dist: fasttext==0.9.2
+Requires-Dist: filetype<2.0.0,>=1.2.0
+Requires-Dist: huggingface-hub<1.0.0,>=0.21.4
+Requires-Dist: langcodes==3.3.0
+Requires-Dist: mmh3==4.1.0
+Requires-Dist: numpy==1.26.4
+Requires-Dist: pandas
+Requires-Dist: parameterized
+Requires-Dist: pyarrow==16.1.0
+Requires-Dist: python-dateutil>=2.8.2
+Requires-Dist: pytz>=2020.1
+Requires-Dist: quackling==0.1.0
+Requires-Dist: scancode-toolkit==32.1.0; platform_system != "Darwin"
+Requires-Dist: sentence-transformers==3.0.1
+Requires-Dist: transformers==4.38.2
+Requires-Dist: tzdata>=2022.7
+Requires-Dist: xxhash==3.4.1
{data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/data_prep_toolkit_transforms.egg-info/SOURCES.txt
RENAMED
@@ -1,7 +1,5 @@
-Makefile
-README.md
 pyproject.toml
-
+src/__init__.py
 src/cc_net_prepro.py
 src/code2parquet_local.py
 src/code2parquet_local_python.py
@@ -19,10 +17,6 @@ src/doc_chunk_local.py
 src/doc_chunk_local_python.py
 src/doc_chunk_transform.py
 src/doc_chunk_transform_python.py
-src/doc_id_local.py
-src/doc_id_local_python.py
-src/doc_id_transform_base.py
-src/doc_id_transform_python.py
 src/doc_quality_local.py
 src/doc_quality_local_python.py
 src/doc_quality_transform.py
@@ -30,7 +24,6 @@ src/doc_quality_transform_python.py
 src/doc_quality_utils.py
 src/ededup_local.py
 src/ededup_local_python.py
-src/ededup_local_python_incremental.py
 src/ededup_transform_base.py
 src/ededup_transform_python.py
 src/filter_local.py
@@ -38,7 +31,6 @@ src/filter_local_python.py
 src/filter_test_support.py
 src/filter_transform.py
 src/filter_transform_python.py
-src/flair_recognizer.py
 src/header_cleanser_local.py
 src/header_cleanser_local_python.py
 src/header_cleanser_test_support.py
@@ -54,12 +46,6 @@ src/pdf2parquet_local.py
 src/pdf2parquet_local_python.py
 src/pdf2parquet_transform.py
 src/pdf2parquet_transform_python.py
-src/pii_analyzer.py
-src/pii_anonymizer.py
-src/pii_redactor_local.py
-src/pii_redactor_local_python.py
-src/pii_redactor_transform.py
-src/pii_redactor_transform_python.py
 src/proglang_select_local.py
 src/proglang_select_local_python.py
 src/proglang_select_transform.py
{data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/data_prep_toolkit_transforms.egg-info/requires.txt
RENAMED
@@ -1,26 +1,26 @@
-data-prep-toolkit
+data-prep-toolkit==0.2.1.dev0
+argparse
+boto3==1.34.69
 bs4==0.0.2
-
-
-docling==1.11.0
-filetype<2.0.0,>=1.2.0
-docling-core==1.3.0
-llama-index-core<0.12.0,>=0.11.0
+clamd==1.0.2
+docling[ocr]==1.1.2
 duckdb==0.10.1
 fasttext==0.9.2
+filetype<2.0.0,>=1.2.0
 huggingface-hub<1.0.0,>=0.21.4
 langcodes==3.3.0
 mmh3==4.1.0
 numpy==1.26.4
 pandas
 parameterized
+pyarrow==16.1.0
+python-dateutil>=2.8.2
+pytz>=2020.1
+quackling==0.1.0
 sentence-transformers==3.0.1
 transformers==4.38.2
+tzdata>=2022.7
 xxhash==3.4.1
-presidio-analyzer>=2.2.355
-presidio-anonymizer>=2.2.355
-flair>=0.14.0
-pandas>=2.2.2

 [:platform_system != "Darwin"]
 scancode-toolkit==32.1.0
{data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/data_prep_toolkit_transforms.egg-info/top_level.txt
RENAMED
@@ -1,3 +1,4 @@
+__init__
 cc_net_prepro
 code2parquet_local
 code2parquet_local_python
@@ -15,10 +16,6 @@ doc_chunk_local
 doc_chunk_local_python
 doc_chunk_transform
 doc_chunk_transform_python
-doc_id_local
-doc_id_local_python
-doc_id_transform_base
-doc_id_transform_python
 doc_quality_local
 doc_quality_local_python
 doc_quality_transform
@@ -26,7 +23,6 @@ doc_quality_transform_python
 doc_quality_utils
 ededup_local
 ededup_local_python
-ededup_local_python_incremental
 ededup_transform_base
 ededup_transform_python
 filter_local
@@ -34,7 +30,6 @@ filter_local_python
 filter_test_support
 filter_transform
 filter_transform_python
-flair_recognizer
 header_cleanser_local
 header_cleanser_local_python
 header_cleanser_test_support
@@ -50,12 +45,6 @@ pdf2parquet_local
 pdf2parquet_local_python
 pdf2parquet_transform
 pdf2parquet_transform_python
-pii_analyzer
-pii_anonymizer
-pii_redactor_local
-pii_redactor_local_python
-pii_redactor_transform
-pii_redactor_transform_python
 proglang_select_local
 proglang_select_local_python
 proglang_select_transform
{data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_Gopher_statistics.py
RENAMED
@@ -49,9 +49,7 @@ def compute_word_statistics(text: str, symbols: list = ["#", "..."]) -> tuple[in
     return total_words, mean_word_len, symbol_to_word_ratio


-def compute_bullet_point_ellipsis_alphabet_word_ratio(
-    text: str, bullets: list = ["-", "*"]
-) -> tuple[float, float, float]:
+def compute_bullet_point_ellipsis_alphabet_word_ratio(text: str, bullets: list = ["-", "*"]) -> tuple[float, float, float]:
     """
     Given a text document:
     - Compute the ratio of lines starting with a bullet point (should be <=90%)
{data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_chunk_chunkers.py
RENAMED
@@ -10,13 +10,14 @@
 # limitations under the License.
 ################################################################################

+import math
 from abc import ABCMeta, abstractmethod
-from typing import Iterator, Optional
+from typing import Iterator

 from docling_core.types import Document as DLDocument
+from quackling.core.chunkers.hierarchical_chunker import HierarchicalChunker
 from llama_index.core import Document as LIDocument
 from llama_index.core.node_parser import MarkdownNodeParser
-from docling_core.transforms.chunker import HierarchicalChunker
@@ -24,25 +25,13 @@ class ChunkingExecutor(metaclass=ABCMeta):
     def chunk(self, content: str) -> Iterator[dict]:
         raise NotImplemented("The chunk() method must be implemented")

-
 class DLJsonChunker(ChunkingExecutor):
-    def __init__(
-        self,
-        min_chunk_len: Optional[int],
-        output_chunk_column_name: str,
-        output_jsonpath_column_name: str,
-        output_pageno_column_name_key: str,
-        output_bbox_column_name_key: str,
-    ):
+    def __init__(self, output_chunk_column_name: str, output_jsonpath_column_name: str, output_pageno_column_name_key: str, output_bbox_column_name_key: str):
         self.output_chunk_column_name = output_chunk_column_name
         self.output_jsonpath_column_name = output_jsonpath_column_name
         self.output_pageno_column_name_key = output_pageno_column_name_key
         self.output_bbox_column_name_key = output_bbox_column_name_key
-
-        chunker_kwargs = dict(include_metadata=True)
-        if min_chunk_len is not None:
-            chunker_kwargs["min_chunk_len"] = min_chunk_len
-        self._chunker = HierarchicalChunker(**chunker_kwargs)
+        self._chunker = HierarchicalChunker(include_metadata=True)

     def chunk(self, content: str) -> Iterator[dict]:
         doc = DLDocument.model_validate_json(content)
@@ -54,7 +43,6 @@ class DLJsonChunker(ChunkingExecutor):
                 self.output_bbox_column_name_key: chunk.bbox,
             }

-
 class LIMarkdown(ChunkingExecutor):
     def __init__(self, output_chunk_column_name: str):
         self.output_chunk_column_name = output_chunk_column_name
@@ -66,3 +54,4 @@ class LIMarkdown(ChunkingExecutor):
         yield {
             self.output_chunk_column_name: node.text,
         }
+
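Two things change in doc_chunk_chunkers.py: `HierarchicalChunker` now comes from `quackling` rather than `docling_core`, and the `min_chunk_len` knob is gone, so the chunker is always built with `include_metadata=True`. For orientation, a minimal sketch of what an implementation of the `ChunkingExecutor` interface looks like (the `ParagraphChunker` below is hypothetical, not part of the package):

```python
from abc import ABCMeta
from typing import Iterator


class ChunkingExecutor(metaclass=ABCMeta):
    # Same interface shape as in doc_chunk_chunkers.py
    # (raising NotImplementedError here rather than NotImplemented).
    def chunk(self, content: str) -> Iterator[dict]:
        raise NotImplementedError("The chunk() method must be implemented")


class ParagraphChunker(ChunkingExecutor):
    """Hypothetical executor: one chunk per blank-line-separated paragraph."""

    def __init__(self, output_chunk_column_name: str):
        self.output_chunk_column_name = output_chunk_column_name

    def chunk(self, content: str) -> Iterator[dict]:
        for para in (p.strip() for p in content.split("\n\n")):
            if para:
                yield {self.output_chunk_column_name: para}
```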
{data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_chunk_transform.py
RENAMED
@@ -24,20 +24,14 @@ from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown
 short_name = "doc_chunk"
 cli_prefix = f"{short_name}_"
 content_column_name_key = "content_column_name"
-doc_id_column_name_key = "doc_id_column_name"
 chunking_type_key = "chunking_type"
-dl_min_chunk_len_key = "dl_min_chunk_len"
 output_chunk_column_name_key = "output_chunk_column_name"
-output_source_doc_id_column_name_key = "output_source_doc_id_column_name"
 output_jsonpath_column_name_key = "output_jsonpath_column_name"
 output_pageno_column_name_key = "output_pageno_column_name"
 output_bbox_column_name_key = "output_bbox_column_name"
 content_column_name_cli_param = f"{cli_prefix}{content_column_name_key}"
-doc_id_column_name_cli_param = f"{cli_prefix}{doc_id_column_name_key}"
 chunking_type_cli_param = f"{cli_prefix}{chunking_type_key}"
-dl_min_chunk_len_cli_param = f"{cli_prefix}{dl_min_chunk_len_key}"
 output_chunk_column_name_cli_param = f"{cli_prefix}{output_chunk_column_name_key}"
-output_source_doc_id_column_name_cli_param = f"{cli_prefix}{output_source_doc_id_column_name_key}"
 output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}"
 output_pageno_column_name_cli_param = f"{cli_prefix}{output_pageno_column_name_key}"
 output_bbox_column_name_cli_param = f"{cli_prefix}{output_bbox_column_name_key}"
@@ -52,11 +46,8 @@ class chunking_types(str, enum.Enum):


 default_content_column_name = "contents"
-default_doc_id_column_name = "document_id"
 default_chunking_type = chunking_types.DL_JSON
-default_dl_min_chunk_len = None
 default_output_chunk_column_name = "contents"
-default_output_source_doc_id_column_name = "source_document_id"
 default_output_jsonpath_column_name = "doc_jsonpath"
 default_output_pageno_column_name = "page_number"
 default_output_bbox_column_name = "bbox"
@@ -82,12 +73,9 @@ class DocChunkTransform(AbstractTableTransform):
         self.chunking_type = config.get(chunking_type_key, default_chunking_type)

         self.content_column_name = config.get(content_column_name_key, default_content_column_name)
-        self.doc_id_column_name = config.get(doc_id_column_name_key, default_doc_id_column_name)
         self.output_chunk_column_name = config.get(output_chunk_column_name_key, default_output_chunk_column_name)
-        self.output_source_doc_id_column_name = config.get(output_source_doc_id_column_name_key, default_output_source_doc_id_column_name)

         # Parameters for Docling JSON chunking
-        self.dl_min_chunk_len = config.get(dl_min_chunk_len_key, default_dl_min_chunk_len)
         self.output_jsonpath_column_name = config.get(
             output_jsonpath_column_name_key, default_output_jsonpath_column_name
         )
@@ -101,7 +89,6 @@ class DocChunkTransform(AbstractTableTransform):
         self.chunker: ChunkingExecutor
         if self.chunking_type == chunking_types.DL_JSON:
             self.chunker = DLJsonChunker(
-                min_chunk_len=self.dl_min_chunk_len,
                 output_chunk_column_name=self.output_chunk_column_name,
                 output_jsonpath_column_name=self.output_jsonpath_column_name,
                 output_pageno_column_name_key=self.output_pageno_column_name_key,
@@ -125,11 +112,8 @@ class DocChunkTransform(AbstractTableTransform):
         for batch in table.to_batches():
             for row in batch.to_pylist():
                 content: str = row[self.content_column_name]
-                new_row = {k: v for k, v in row.items() if k not in (self.content_column_name,
-                if self.doc_id_column_name in row:
-                    new_row[self.output_source_doc_id_column_name] = row[self.doc_id_column_name]
+                new_row = {k: v for k, v in row.items() if k not in (self.content_column_name,)}
                 for chunk in self.chunker.chunk(content):
-                    chunk[self.doc_id_column_name] = TransformUtils.str_to_hash(chunk[self.output_chunk_column_name])
                     data.append(
                         {
                             **new_row,
@@ -178,26 +162,11 @@ class DocChunkTransformConfiguration(TransformConfiguration):
             default=default_content_column_name,
             help="Name of the column containing the text to be chunked",
         )
-        parser.add_argument(
-            f"--{doc_id_column_name_cli_param}",
-            default=default_doc_id_column_name,
-            help="Name of the column containing the doc_id to be propagated in the output",
-        )
-        parser.add_argument(
-            f"--{dl_min_chunk_len_cli_param}",
-            default=default_dl_min_chunk_len,
-            help="Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a min_chunk_len=64.",
-        )
         parser.add_argument(
             f"--{output_chunk_column_name_cli_param}",
             default=default_output_chunk_column_name,
             help="Column name to store the chunks",
         )
-        parser.add_argument(
-            f"--{output_source_doc_id_column_name_cli_param}",
-            default=default_output_source_doc_id_column_name,
-            help="Column name to store the `document_id` from the input table",
-        )
         parser.add_argument(
             f"--{output_jsonpath_column_name_cli_param}",
             default=default_output_jsonpath_column_name,
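With the doc_id plumbing removed from doc_chunk_transform.py, each output row is now just the input row minus the content column, merged with whatever the chunker yields. An illustrative reproduction of that per-row expansion with plain dicts (the sample values are made up):

```python
# Mirrors the new_row / chunk merge in the transform above, on plain dicts.
row = {"contents": "para one\n\npara two", "source": "a.pdf"}  # hypothetical row
content_column_name = "contents"

new_row = {k: v for k, v in row.items() if k not in (content_column_name,)}
chunks = [{"contents": "para one"}, {"contents": "para two"}]  # stand-in chunker output
data = [{**new_row, **chunk} for chunk in chunks]
assert data == [
    {"source": "a.pdf", "contents": "para one"},
    {"source": "a.pdf", "contents": "para two"},
]
```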
{data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_quality_local_python.py
RENAMED
@@ -16,13 +16,12 @@ import sys
 from data_processing.runtime.pure_python import PythonTransformLauncher
 from data_processing.utils import ParamsUtils
 from doc_quality_transform import (
-    bad_word_filepath_cli_param,
-    doc_content_column_cli_param,
     text_lang_cli_param,
+    doc_content_column_cli_param,
+    bad_word_filepath_cli_param,
 )
 from doc_quality_transform_python import DocQualityPythonTransformConfiguration

-
 # create parameters
 input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
 output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
@@ -32,7 +31,7 @@ local_conf = {
 }
 code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
 basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
-model_path
+model_path=os.path.join(basedir, "models")
 if not os.path.exists(model_path):
     model_path = os.path.abspath(os.path.join(basedir, "..", "models"))
{data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_quality_transform.py
RENAMED
@@ -10,12 +10,12 @@
 # limitations under the License.
 ################################################################################

-import os
 from argparse import ArgumentParser, Namespace
 from typing import Any

+import os
 import pyarrow as pa
-from data_processing.data_access import DataAccessFactory
+from data_processing.data_access import DataAccessFactory, DataAccess
 from data_processing.transform import AbstractTableTransform, TransformConfiguration
 from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger
 from doc_c4_statistics import (
@@ -32,7 +32,6 @@ from doc_Gopher_statistics import (
     find_first_japanese_alphabet_position,
 )

-
 logger = get_logger(__name__)

 short_name = "docq"
@@ -50,7 +49,6 @@ default_doc_content_column = "contents"
 data_factory_internal_key = f"{cli_prefix}data_factory"
 files_to_use_internal_key = f"{cli_prefix}files_to_use"

-
 class DocQualityTransform(AbstractTableTransform):
     """
     Implements a transform to calculate document quality.
@@ -67,7 +65,7 @@ class DocQualityTransform(AbstractTableTransform):
         super().__init__(config)
         self.text_lang = config.get(text_lang_key, default_text_lang)
         self.doc_content_column = config.get(doc_content_column_key, default_doc_content_column)
-
+
         daf = config.get(data_factory_internal_key, None)
         bad_word_filepath = config.get(bad_word_filepath_key, None)
         if bad_word_filepath is not None:
@@ -75,14 +73,11 @@ class DocQualityTransform(AbstractTableTransform):
             logger.info(f"Load badwords found locally from {bad_word_filepath}")
             self.re_pattern = c4_load_ldnoobw_words(ft_lang=self.text_lang, file_path=bad_word_filepath)
         else:
-            if daf is None:
-                raise RuntimeError(
-                    f"Did not find DataAccessFactory instance under {data_factory_internal_key} key. This is required when bad word file is not in the local file system."
-                )
+            if daf is None:
+                raise RuntimeError(f"Did not find DataAccessFactory instance under {data_factory_internal_key} key. This is required when bad word file is not in the local file system.")
             logger.info(f"Load badwords from remote")
             data_access = daf.create_data_access()
             import tempfile
-
             with tempfile.TemporaryDirectory() as temp_dir:
                 # use a temporary directory until model is loaded to memory
                 bad_word_filepath = self._write_locally(data_access, bad_word_filepath, temp_dir)
@@ -92,7 +87,7 @@ class DocQualityTransform(AbstractTableTransform):
             filename = os.path.basename(path)
             content, _ = data_access.get_file(path)
             temp_file_path = os.path.join(temp_dir, filename)
-            with open(temp_file_path,
+            with open(temp_file_path, 'wb') as temp_file:
                 temp_file.write(content)
             return temp_file_path

@@ -190,7 +185,6 @@ class DocQualityTransformConfiguration(TransformConfiguration):
     Provides support for configuring and using the associated Transform class include
     configuration with CLI args.
     """
-
     def __init__(self):
         super().__init__(
             name=short_name,
@@ -207,7 +201,9 @@ class DocQualityTransformConfiguration(TransformConfiguration):
         (e.g, noop_, pii_, etc.)
         """
         parser.add_argument(
-            f"--{text_lang_cli_param}",
+            f"--{text_lang_cli_param}",
+            default=default_text_lang,
+            help="language used in the text content"
         )
         parser.add_argument(
             f"--{doc_content_column_cli_param}",
@@ -229,13 +225,9 @@ class DocQualityTransformConfiguration(TransformConfiguration):
         :return: True, if validate pass or False otherwise
         """
         captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
-        self.params = (
-            self.params
-            | captured
-            | {
-                data_factory_internal_key: self.daf,
-            }
-        )
+        self.params = self.params | captured | {
+            data_factory_internal_key: self.daf,
+        }
         logger.info(f"doc_quality parameters are : {self.params}")
         # Validate and populate the transform's DataAccessFactory
         return self.daf.apply_input_params(args)
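The rewritten parameter handling in doc_quality_transform.py leans on the dict union operator (`|`, Python 3.9+), which the package's `>=3.10` floor guarantees; right-hand operands win on key clashes:

```python
# Dict union semantics used by the rewritten params merge above.
params = {"a": 1, "shared": "old"}
captured = {"b": 2, "shared": "new"}
merged = params | captured | {"data_factory": "daf-instance"}
assert merged == {"a": 1, "shared": "new", "b": 2, "data_factory": "daf-instance"}
```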
{data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/ededup_local.py
RENAMED
@@ -13,10 +13,7 @@
 import os

 from data_processing.data_access import DataAccessLocal
-from
-from ededup_transform_python import EdedupTransform
-from ededup_transform_base import doc_column_name_key, int_column_name_key
-
+from ededup_transform_python import EdedupPythonTransform

 # create parameters
 input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input"))
@@ -26,18 +23,18 @@ local_conf = {
     "output_folder": output_folder,
 }

-ededup_params = {
+ededup_params = {"doc_column": "contents"}

 if __name__ == "__main__":
     # Here we show how to run outside of ray
     # Filter transform needs a DataAccess to ready the domain list.
     data_access = DataAccessLocal(local_conf)
     # Create and configure the transform.
-    transform =
+    transform = EdedupPythonTransform(ededup_params)
     # Use the local data access to read a parquet table.
     table, _ = data_access.get_table(os.path.join(input_folder, "sample1.parquet"))
-    print(f"input table has {table.num_rows} rows
+    print(f"input table has {table.num_rows} rows")
     # Transform the table
     table_list, metadata = transform.transform(table)
-    print(f"\noutput table has {table_list[0].num_rows} rows
+    print(f"\noutput table has {table_list[0].num_rows} rows")
     print(f"output metadata : {metadata}")
{data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/ededup_local_python.py
RENAMED
@@ -13,14 +13,13 @@
 import os
 import sys

-from data_processing.runtime.pure_python import PythonTransformLauncher
 from data_processing.utils import ParamsUtils
-from
-from
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from ededup_transform_python import EdedupPythonTransformConfiguration


 # create launcher
-launcher = PythonTransformLauncher(
+launcher = PythonTransformLauncher(EdedupPythonTransformConfiguration())
 # create parameters
 input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input"))
 output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output"))
@@ -37,8 +36,7 @@ params = {
     "runtime_job_id": "job_id",
     "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
     # ededup parameters
-
-    int_column_name_cli_param: "document_id",
+    "ededup_doc_column": "contents",
 }
 sys.argv = ParamsUtils.dict_to_req(d=params)