data-prep-toolkit-transforms 0.2.1__tar.gz → 0.2.1.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_prep_toolkit_transforms-0.2.1.dev1/Makefile +77 -0
- data_prep_toolkit_transforms-0.2.1.dev1/PKG-INFO +67 -0
- data_prep_toolkit_transforms-0.2.1.dev1/README.md +33 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/pyproject.toml +28 -4
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/code2parquet_transform.py +8 -4
- data_prep_toolkit_transforms-0.2.1.dev1/src/data_prep_toolkit_transforms.egg-info/PKG-INFO +67 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/data_prep_toolkit_transforms.egg-info/SOURCES.txt +0 -13
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/data_prep_toolkit_transforms.egg-info/requires.txt +11 -11
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/data_prep_toolkit_transforms.egg-info/top_level.txt +0 -12
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/doc_Gopher_statistics.py +1 -3
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/doc_chunk_chunkers.py +6 -17
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/doc_chunk_transform.py +1 -32
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/doc_quality_local_python.py +3 -4
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/doc_quality_transform.py +12 -20
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/ededup_local.py +5 -8
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/ededup_local_python.py +4 -6
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/ededup_transform_base.py +16 -111
- data_prep_toolkit_transforms-0.2.1.dev1/src/ededup_transform_python.py +69 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/filter_local_python.py +2 -2
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/filter_transform.py +3 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/header_cleanser_local.py +5 -5
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/header_cleanser_local_python.py +2 -2
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/header_cleanser_test_support.py +4 -4
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/header_cleanser_transform.py +63 -66
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/header_cleanser_transform_python.py +1 -1
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/lang_id_transform.py +4 -2
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/nlp.py +4 -10
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/pdf2parquet_local.py +3 -7
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/pdf2parquet_local_python.py +5 -11
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/pdf2parquet_transform.py +29 -62
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/proglang_select_local_python.py +2 -2
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/proglang_select_transform.py +3 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/resize_transform.py +2 -8
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/text_encoder_local.py +3 -1
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/text_encoder_local_python.py +6 -2
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/tokenization_local_python.py +2 -2
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/tokenization_transform.py +3 -0
- data_prep_toolkit_transforms-0.2.1/Makefile +0 -62
- data_prep_toolkit_transforms-0.2.1/PKG-INFO +0 -73
- data_prep_toolkit_transforms-0.2.1/README.md +0 -39
- data_prep_toolkit_transforms-0.2.1/requirements.txt +0 -31
- data_prep_toolkit_transforms-0.2.1/src/data_prep_toolkit_transforms.egg-info/PKG-INFO +0 -73
- data_prep_toolkit_transforms-0.2.1/src/doc_id_local.py +0 -54
- data_prep_toolkit_transforms-0.2.1/src/doc_id_local_python.py +0 -52
- data_prep_toolkit_transforms-0.2.1/src/doc_id_transform_base.py +0 -177
- data_prep_toolkit_transforms-0.2.1/src/doc_id_transform_python.py +0 -120
- data_prep_toolkit_transforms-0.2.1/src/ededup_local_python_incremental.py +0 -53
- data_prep_toolkit_transforms-0.2.1/src/ededup_transform_python.py +0 -145
- data_prep_toolkit_transforms-0.2.1/src/flair_recognizer.py +0 -149
- data_prep_toolkit_transforms-0.2.1/src/pii_analyzer.py +0 -71
- data_prep_toolkit_transforms-0.2.1/src/pii_anonymizer.py +0 -27
- data_prep_toolkit_transforms-0.2.1/src/pii_redactor_local.py +0 -37
- data_prep_toolkit_transforms-0.2.1/src/pii_redactor_local_python.py +0 -37
- data_prep_toolkit_transforms-0.2.1/src/pii_redactor_transform.py +0 -152
- data_prep_toolkit_transforms-0.2.1/src/pii_redactor_transform_python.py +0 -35
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/setup.cfg +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/cc_net_prepro.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/code2parquet_local.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/code2parquet_local_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/code2parquet_s3_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/code2parquet_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/code_quality_local.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/code_quality_local_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/code_quality_transform.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/code_quality_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/data_prep_toolkit_transforms.egg-info/dependency_links.txt +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/doc_c4_statistics.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/doc_chunk_local.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/doc_chunk_local_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/doc_chunk_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/doc_quality_local.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/doc_quality_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/doc_quality_utils.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/filter_local.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/filter_test_support.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/filter_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/lang_id_local.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/lang_id_local_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/lang_id_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/lang_models.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/pdf2parquet_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/proglang_select_local.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/proglang_select_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/resize_local.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/resize_local_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/resize_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/text_encoder_transform.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/text_encoder_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/tokenization_local_long_doc_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/tokenization_s3_long_doc_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/tokenization_transform_python.py +0 -0
- {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/src/tokenization_utils.py +0 -0
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# Define the root of the local git clone for the common rules to be able
|
|
2
|
+
# to know where they are running from.
|
|
3
|
+
REPOROOT=../../..
|
|
4
|
+
# Include a library of common .transform.* targets which most
|
|
5
|
+
# transforms should be able to reuse. However, feel free
|
|
6
|
+
# to override/redefine the rules below.
|
|
7
|
+
|
|
8
|
+
# $(REPOROOT)/.make.versions file contains the versions
|
|
9
|
+
|
|
10
|
+
#TRANSFORM_NAME=doc_quality
|
|
11
|
+
|
|
12
|
+
include $(REPOROOT)/transforms/.make.transforms
|
|
13
|
+
|
|
14
|
+
TRANSFORMS_NAMES = code/code_quality \
|
|
15
|
+
code/code2parquet \
|
|
16
|
+
code/header_cleanser \
|
|
17
|
+
code/code_quality \
|
|
18
|
+
code/proglang_select \
|
|
19
|
+
language/doc_chunk \
|
|
20
|
+
language/doc_quality \
|
|
21
|
+
language/lang_id \
|
|
22
|
+
language/pdf2parquet \
|
|
23
|
+
language/text_encoder \
|
|
24
|
+
universal/ededup \
|
|
25
|
+
universal/filter \
|
|
26
|
+
universal/resize \
|
|
27
|
+
universal/tokenization
|
|
28
|
+
|
|
29
|
+
venv:
|
|
30
|
+
$(MAKE) .defaults.create-venv
|
|
31
|
+
source venv/bin/activate; \
|
|
32
|
+
$(PYTHON) -m pip install .
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
test:: setup venv test-src
|
|
36
|
+
|
|
37
|
+
clean:: .transforms.clean
|
|
38
|
+
-rm -fr src
|
|
39
|
+
|
|
40
|
+
image:: .transforms.python-image
|
|
41
|
+
|
|
42
|
+
test-src::
|
|
43
|
+
source venv/bin/activate; \
|
|
44
|
+
for T in $(TRANSFORMS_NAMES); do \
|
|
45
|
+
echo running unit test on: $$T ; \
|
|
46
|
+
$(PYTEST) $(REPOROOT)/transforms/$$T/python/test; \
|
|
47
|
+
done;
|
|
48
|
+
|
|
49
|
+
test-with-pypi:
|
|
50
|
+
$(MAKE) .defaults.create-venv
|
|
51
|
+
source venv/bin/activate; \
|
|
52
|
+
$(PYTHON) -m pip install data_prep_toolkit_transforms==0.2.1.dev0
|
|
53
|
+
$(MAKE) test-src
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
setup: .transforms.setup
|
|
57
|
+
$(MAKE) src
|
|
58
|
+
|
|
59
|
+
src:
|
|
60
|
+
for T in $(TRANSFORMS_NAMES); do \
|
|
61
|
+
echo copy src from $$T ; \
|
|
62
|
+
cp -R $(REPOROOT)/transforms/$$T/python/src/ src/ ; \
|
|
63
|
+
rm -fr *.egg-info ; \
|
|
64
|
+
rm -fr dist ; \
|
|
65
|
+
rm -fr build ; \
|
|
66
|
+
done;
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
build:: build-dist
|
|
70
|
+
|
|
71
|
+
publish:: publish-dist
|
|
72
|
+
|
|
73
|
+
build-dist:: setup .defaults.build-dist
|
|
74
|
+
|
|
75
|
+
publish-dist:: .defaults.publish-dist
|
|
76
|
+
|
|
77
|
+
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: data_prep_toolkit_transforms
|
|
3
|
+
Version: 0.2.1.dev1
|
|
4
|
+
Summary: Data Preparation Toolkit Transforms
|
|
5
|
+
Author-email: Maroun Touma <touma@us.ibm.com>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Keywords: transforms,data preprocessing,data preparation,llm,generative,ai,fine-tuning,llmapps
|
|
8
|
+
Requires-Python: <3.12,>=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: data-prep-toolkit==0.2.1.dev0
|
|
11
|
+
Requires-Dist: argparse
|
|
12
|
+
Requires-Dist: boto3==1.34.69
|
|
13
|
+
Requires-Dist: bs4==0.0.2
|
|
14
|
+
Requires-Dist: clamd==1.0.2
|
|
15
|
+
Requires-Dist: docling[ocr]==1.1.2
|
|
16
|
+
Requires-Dist: duckdb==0.10.1
|
|
17
|
+
Requires-Dist: fasttext==0.9.2
|
|
18
|
+
Requires-Dist: filetype<2.0.0,>=1.2.0
|
|
19
|
+
Requires-Dist: huggingface-hub<1.0.0,>=0.21.4
|
|
20
|
+
Requires-Dist: langcodes==3.3.0
|
|
21
|
+
Requires-Dist: mmh3==4.1.0
|
|
22
|
+
Requires-Dist: numpy==1.26.4
|
|
23
|
+
Requires-Dist: pandas
|
|
24
|
+
Requires-Dist: parameterized
|
|
25
|
+
Requires-Dist: pyarrow==16.1.0
|
|
26
|
+
Requires-Dist: python-dateutil>=2.8.2
|
|
27
|
+
Requires-Dist: pytz>=2020.1
|
|
28
|
+
Requires-Dist: quackling==0.1.0
|
|
29
|
+
Requires-Dist: scancode-toolkit==32.1.0; platform_system != "Darwin"
|
|
30
|
+
Requires-Dist: sentence-transformers==3.0.1
|
|
31
|
+
Requires-Dist: transformers==4.38.2
|
|
32
|
+
Requires-Dist: tzdata>=2022.7
|
|
33
|
+
Requires-Dist: xxhash==3.4.1
|
|
34
|
+
|
|
35
|
+
# DPK Python Transforms
|
|
36
|
+
|
|
37
|
+
## installation
|
|
38
|
+
|
|
39
|
+
The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard Python library available on PyPI and can be installed using `pip install`:
|
|
40
|
+
|
|
41
|
+
`python -m pip install data-prep-toolkit-transforms`
|
|
42
|
+
|
|
43
|
+
Installing the Python transforms will also install `data-prep-toolkit`.
|
|
44
|
+
|
|
45
|
+
## List of Transforms in current package
|
|
46
|
+
|
|
47
|
+
* code
|
|
48
|
+
* [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/python/README.md)
|
|
49
|
+
* header_cleanser (Not available on MacOS)
|
|
50
|
+
* code_quality
|
|
51
|
+
* proglang_select
|
|
52
|
+
* language
|
|
53
|
+
* doc_chunk
|
|
54
|
+
* doc_quality
|
|
55
|
+
* lang_id
|
|
56
|
+
* pdf2parquet
|
|
57
|
+
* text_encoder
|
|
58
|
+
* universal
|
|
59
|
+
* ededup
|
|
60
|
+
* filter
|
|
61
|
+
* resize
|
|
62
|
+
* tokenization
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# DPK Python Transforms
|
|
2
|
+
|
|
3
|
+
## installation
|
|
4
|
+
|
|
5
|
+
The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard Python library available on PyPI and can be installed using `pip install`:
|
|
6
|
+
|
|
7
|
+
`python -m pip install data-prep-toolkit-transforms`
|
|
8
|
+
|
|
9
|
+
Installing the Python transforms will also install `data-prep-toolkit`.
|
|
10
|
+
|
|
11
|
+
## List of Transforms in current package
|
|
12
|
+
|
|
13
|
+
* code
|
|
14
|
+
* [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/python/README.md)
|
|
15
|
+
* header_cleanser (Not available on MacOS)
|
|
16
|
+
* code_quality
|
|
17
|
+
* proglang_select
|
|
18
|
+
* language
|
|
19
|
+
* doc_chunk
|
|
20
|
+
* doc_quality
|
|
21
|
+
* lang_id
|
|
22
|
+
* pdf2parquet
|
|
23
|
+
* text_encoder
|
|
24
|
+
* universal
|
|
25
|
+
* ededup
|
|
26
|
+
* filter
|
|
27
|
+
* resize
|
|
28
|
+
* tokenization
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
|
{data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev1}/pyproject.toml
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "data_prep_toolkit_transforms"
|
|
3
|
-
version = "0.2.1"
|
|
3
|
+
version = "0.2.1.dev1"
|
|
4
4
|
requires-python = ">=3.10,<3.12"
|
|
5
5
|
keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
|
|
6
6
|
description = "Data Preparation Toolkit Transforms"
|
|
@@ -9,14 +9,38 @@ readme = {file = "README.md", content-type = "text/markdown"}
|
|
|
9
9
|
authors = [
|
|
10
10
|
{ name = "Maroun Touma", email = "touma@us.ibm.com" },
|
|
11
11
|
]
|
|
12
|
-
|
|
12
|
+
|
|
13
|
+
dependencies = [
|
|
14
|
+
"data-prep-toolkit==0.2.1.dev0",
|
|
15
|
+
"argparse",
|
|
16
|
+
"boto3==1.34.69",
|
|
17
|
+
"bs4==0.0.2",
|
|
18
|
+
"clamd==1.0.2",
|
|
19
|
+
"docling[ocr]==1.1.2",
|
|
20
|
+
"duckdb==0.10.1",
|
|
21
|
+
"fasttext==0.9.2",
|
|
22
|
+
"filetype >=1.2.0, <2.0.0",
|
|
23
|
+
"huggingface-hub >= 0.21.4, <1.0.0",
|
|
24
|
+
"langcodes==3.3.0",
|
|
25
|
+
"mmh3==4.1.0",
|
|
26
|
+
"numpy==1.26.4",
|
|
27
|
+
"pandas",
|
|
28
|
+
"parameterized",
|
|
29
|
+
"pyarrow==16.1.0",
|
|
30
|
+
"python-dateutil>=2.8.2",
|
|
31
|
+
"pytz>=2020.1",
|
|
32
|
+
"quackling==0.1.0",
|
|
33
|
+
"scancode-toolkit==32.1.0 ; platform_system != 'Darwin'",
|
|
34
|
+
"sentence-transformers==3.0.1",
|
|
35
|
+
"transformers==4.38.2",
|
|
36
|
+
"tzdata>=2022.7",
|
|
37
|
+
"xxhash==3.4.1",
|
|
38
|
+
]
|
|
13
39
|
|
|
14
40
|
[build-system]
|
|
15
41
|
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
|
|
16
42
|
build-backend = "setuptools.build_meta"
|
|
17
43
|
|
|
18
|
-
[tool.setuptools.dynamic]
|
|
19
|
-
dependencies = {file = ["requirements.txt"]}
|
|
20
44
|
|
|
21
45
|
[options]
|
|
22
46
|
package_dir = ["src"]
|
|
@@ -13,16 +13,20 @@
|
|
|
13
13
|
import io
|
|
14
14
|
import json
|
|
15
15
|
import logging
|
|
16
|
-
import os
|
|
17
16
|
import uuid
|
|
18
17
|
import zipfile
|
|
19
18
|
from argparse import ArgumentParser, Namespace
|
|
20
19
|
from datetime import datetime
|
|
21
20
|
from typing import Any
|
|
21
|
+
import os
|
|
22
22
|
|
|
23
23
|
import pyarrow as pa
|
|
24
24
|
from data_processing.data_access import DataAccess, DataAccessFactory
|
|
25
|
-
from data_processing.transform import
|
|
25
|
+
from data_processing.transform import (
|
|
26
|
+
AbstractBinaryTransform,
|
|
27
|
+
AbstractTransform,
|
|
28
|
+
TransformConfiguration,
|
|
29
|
+
)
|
|
26
30
|
from data_processing.utils import CLIArgumentProvider, TransformUtils, str2bool
|
|
27
31
|
|
|
28
32
|
|
|
@@ -132,7 +136,7 @@ class CodeToParquetTransform(AbstractBinaryTransform):
|
|
|
132
136
|
"hash": TransformUtils.str_to_hash(content_string),
|
|
133
137
|
"size": len(content_string),
|
|
134
138
|
"date_acquired": datetime.now().isoformat(),
|
|
135
|
-
"repo_name":
|
|
139
|
+
"repo_name":os.path.splitext(os.path.basename(file_name))[0]
|
|
136
140
|
} | self.shared_columns
|
|
137
141
|
if self.detect_programming_lang:
|
|
138
142
|
lang = self._get_lang_from_ext(ext)
|
|
@@ -155,7 +159,7 @@ class CodeToParquetTransformConfiguration(TransformConfiguration):
|
|
|
155
159
|
configuration with CLI args and combining of metadata.
|
|
156
160
|
"""
|
|
157
161
|
|
|
158
|
-
def __init__(self, transform_class: type[
|
|
162
|
+
def __init__(self, transform_class: type[AbstractTransform] = CodeToParquetTransform):
|
|
159
163
|
super().__init__(
|
|
160
164
|
name=shortname,
|
|
161
165
|
transform_class=transform_class,
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: data_prep_toolkit_transforms
|
|
3
|
+
Version: 0.2.1.dev1
|
|
4
|
+
Summary: Data Preparation Toolkit Transforms
|
|
5
|
+
Author-email: Maroun Touma <touma@us.ibm.com>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Keywords: transforms,data preprocessing,data preparation,llm,generative,ai,fine-tuning,llmapps
|
|
8
|
+
Requires-Python: <3.12,>=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: data-prep-toolkit==0.2.1.dev0
|
|
11
|
+
Requires-Dist: argparse
|
|
12
|
+
Requires-Dist: boto3==1.34.69
|
|
13
|
+
Requires-Dist: bs4==0.0.2
|
|
14
|
+
Requires-Dist: clamd==1.0.2
|
|
15
|
+
Requires-Dist: docling[ocr]==1.1.2
|
|
16
|
+
Requires-Dist: duckdb==0.10.1
|
|
17
|
+
Requires-Dist: fasttext==0.9.2
|
|
18
|
+
Requires-Dist: filetype<2.0.0,>=1.2.0
|
|
19
|
+
Requires-Dist: huggingface-hub<1.0.0,>=0.21.4
|
|
20
|
+
Requires-Dist: langcodes==3.3.0
|
|
21
|
+
Requires-Dist: mmh3==4.1.0
|
|
22
|
+
Requires-Dist: numpy==1.26.4
|
|
23
|
+
Requires-Dist: pandas
|
|
24
|
+
Requires-Dist: parameterized
|
|
25
|
+
Requires-Dist: pyarrow==16.1.0
|
|
26
|
+
Requires-Dist: python-dateutil>=2.8.2
|
|
27
|
+
Requires-Dist: pytz>=2020.1
|
|
28
|
+
Requires-Dist: quackling==0.1.0
|
|
29
|
+
Requires-Dist: scancode-toolkit==32.1.0; platform_system != "Darwin"
|
|
30
|
+
Requires-Dist: sentence-transformers==3.0.1
|
|
31
|
+
Requires-Dist: transformers==4.38.2
|
|
32
|
+
Requires-Dist: tzdata>=2022.7
|
|
33
|
+
Requires-Dist: xxhash==3.4.1
|
|
34
|
+
|
|
35
|
+
# DPK Python Transforms
|
|
36
|
+
|
|
37
|
+
## installation
|
|
38
|
+
|
|
39
|
+
The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard Python library available on PyPI and can be installed using `pip install`:
|
|
40
|
+
|
|
41
|
+
`python -m pip install data-prep-toolkit-transforms`
|
|
42
|
+
|
|
43
|
+
Installing the Python transforms will also install `data-prep-toolkit`.
|
|
44
|
+
|
|
45
|
+
## List of Transforms in current package
|
|
46
|
+
|
|
47
|
+
* code
|
|
48
|
+
* [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/python/README.md)
|
|
49
|
+
* header_cleanser (Not available on MacOS)
|
|
50
|
+
* code_quality
|
|
51
|
+
* proglang_select
|
|
52
|
+
* language
|
|
53
|
+
* doc_chunk
|
|
54
|
+
* doc_quality
|
|
55
|
+
* lang_id
|
|
56
|
+
* pdf2parquet
|
|
57
|
+
* text_encoder
|
|
58
|
+
* universal
|
|
59
|
+
* ededup
|
|
60
|
+
* filter
|
|
61
|
+
* resize
|
|
62
|
+
* tokenization
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
Makefile
|
|
2
2
|
README.md
|
|
3
3
|
pyproject.toml
|
|
4
|
-
requirements.txt
|
|
5
4
|
src/cc_net_prepro.py
|
|
6
5
|
src/code2parquet_local.py
|
|
7
6
|
src/code2parquet_local_python.py
|
|
@@ -19,10 +18,6 @@ src/doc_chunk_local.py
|
|
|
19
18
|
src/doc_chunk_local_python.py
|
|
20
19
|
src/doc_chunk_transform.py
|
|
21
20
|
src/doc_chunk_transform_python.py
|
|
22
|
-
src/doc_id_local.py
|
|
23
|
-
src/doc_id_local_python.py
|
|
24
|
-
src/doc_id_transform_base.py
|
|
25
|
-
src/doc_id_transform_python.py
|
|
26
21
|
src/doc_quality_local.py
|
|
27
22
|
src/doc_quality_local_python.py
|
|
28
23
|
src/doc_quality_transform.py
|
|
@@ -30,7 +25,6 @@ src/doc_quality_transform_python.py
|
|
|
30
25
|
src/doc_quality_utils.py
|
|
31
26
|
src/ededup_local.py
|
|
32
27
|
src/ededup_local_python.py
|
|
33
|
-
src/ededup_local_python_incremental.py
|
|
34
28
|
src/ededup_transform_base.py
|
|
35
29
|
src/ededup_transform_python.py
|
|
36
30
|
src/filter_local.py
|
|
@@ -38,7 +32,6 @@ src/filter_local_python.py
|
|
|
38
32
|
src/filter_test_support.py
|
|
39
33
|
src/filter_transform.py
|
|
40
34
|
src/filter_transform_python.py
|
|
41
|
-
src/flair_recognizer.py
|
|
42
35
|
src/header_cleanser_local.py
|
|
43
36
|
src/header_cleanser_local_python.py
|
|
44
37
|
src/header_cleanser_test_support.py
|
|
@@ -54,12 +47,6 @@ src/pdf2parquet_local.py
|
|
|
54
47
|
src/pdf2parquet_local_python.py
|
|
55
48
|
src/pdf2parquet_transform.py
|
|
56
49
|
src/pdf2parquet_transform_python.py
|
|
57
|
-
src/pii_analyzer.py
|
|
58
|
-
src/pii_anonymizer.py
|
|
59
|
-
src/pii_redactor_local.py
|
|
60
|
-
src/pii_redactor_local_python.py
|
|
61
|
-
src/pii_redactor_transform.py
|
|
62
|
-
src/pii_redactor_transform_python.py
|
|
63
50
|
src/proglang_select_local.py
|
|
64
51
|
src/proglang_select_local_python.py
|
|
65
52
|
src/proglang_select_transform.py
|
|
@@ -1,26 +1,26 @@
|
|
|
1
|
-
data-prep-toolkit
|
|
1
|
+
data-prep-toolkit==0.2.1.dev0
|
|
2
|
+
argparse
|
|
3
|
+
boto3==1.34.69
|
|
2
4
|
bs4==0.0.2
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
docling==1.11.0
|
|
6
|
-
filetype<2.0.0,>=1.2.0
|
|
7
|
-
docling-core==1.3.0
|
|
8
|
-
llama-index-core<0.12.0,>=0.11.0
|
|
5
|
+
clamd==1.0.2
|
|
6
|
+
docling[ocr]==1.1.2
|
|
9
7
|
duckdb==0.10.1
|
|
10
8
|
fasttext==0.9.2
|
|
9
|
+
filetype<2.0.0,>=1.2.0
|
|
11
10
|
huggingface-hub<1.0.0,>=0.21.4
|
|
12
11
|
langcodes==3.3.0
|
|
13
12
|
mmh3==4.1.0
|
|
14
13
|
numpy==1.26.4
|
|
15
14
|
pandas
|
|
16
15
|
parameterized
|
|
16
|
+
pyarrow==16.1.0
|
|
17
|
+
python-dateutil>=2.8.2
|
|
18
|
+
pytz>=2020.1
|
|
19
|
+
quackling==0.1.0
|
|
17
20
|
sentence-transformers==3.0.1
|
|
18
21
|
transformers==4.38.2
|
|
22
|
+
tzdata>=2022.7
|
|
19
23
|
xxhash==3.4.1
|
|
20
|
-
presidio-analyzer>=2.2.355
|
|
21
|
-
presidio-anonymizer>=2.2.355
|
|
22
|
-
flair>=0.14.0
|
|
23
|
-
pandas>=2.2.2
|
|
24
24
|
|
|
25
25
|
[:platform_system != "Darwin"]
|
|
26
26
|
scancode-toolkit==32.1.0
|
|
@@ -15,10 +15,6 @@ doc_chunk_local
|
|
|
15
15
|
doc_chunk_local_python
|
|
16
16
|
doc_chunk_transform
|
|
17
17
|
doc_chunk_transform_python
|
|
18
|
-
doc_id_local
|
|
19
|
-
doc_id_local_python
|
|
20
|
-
doc_id_transform_base
|
|
21
|
-
doc_id_transform_python
|
|
22
18
|
doc_quality_local
|
|
23
19
|
doc_quality_local_python
|
|
24
20
|
doc_quality_transform
|
|
@@ -26,7 +22,6 @@ doc_quality_transform_python
|
|
|
26
22
|
doc_quality_utils
|
|
27
23
|
ededup_local
|
|
28
24
|
ededup_local_python
|
|
29
|
-
ededup_local_python_incremental
|
|
30
25
|
ededup_transform_base
|
|
31
26
|
ededup_transform_python
|
|
32
27
|
filter_local
|
|
@@ -34,7 +29,6 @@ filter_local_python
|
|
|
34
29
|
filter_test_support
|
|
35
30
|
filter_transform
|
|
36
31
|
filter_transform_python
|
|
37
|
-
flair_recognizer
|
|
38
32
|
header_cleanser_local
|
|
39
33
|
header_cleanser_local_python
|
|
40
34
|
header_cleanser_test_support
|
|
@@ -50,12 +44,6 @@ pdf2parquet_local
|
|
|
50
44
|
pdf2parquet_local_python
|
|
51
45
|
pdf2parquet_transform
|
|
52
46
|
pdf2parquet_transform_python
|
|
53
|
-
pii_analyzer
|
|
54
|
-
pii_anonymizer
|
|
55
|
-
pii_redactor_local
|
|
56
|
-
pii_redactor_local_python
|
|
57
|
-
pii_redactor_transform
|
|
58
|
-
pii_redactor_transform_python
|
|
59
47
|
proglang_select_local
|
|
60
48
|
proglang_select_local_python
|
|
61
49
|
proglang_select_transform
|
|
@@ -49,9 +49,7 @@ def compute_word_statistics(text: str, symbols: list = ["#", "..."]) -> tuple[in
|
|
|
49
49
|
return total_words, mean_word_len, symbol_to_word_ratio
|
|
50
50
|
|
|
51
51
|
|
|
52
|
-
def compute_bullet_point_ellipsis_alphabet_word_ratio(
|
|
53
|
-
text: str, bullets: list = ["-", "*"]
|
|
54
|
-
) -> tuple[float, float, float]:
|
|
52
|
+
def compute_bullet_point_ellipsis_alphabet_word_ratio(text: str, bullets: list = ["-", "*"]) -> tuple[float, float, float]:
|
|
55
53
|
"""
|
|
56
54
|
Given a text document:
|
|
57
55
|
- Compute the ratio of lines starting with a bullet point (should be <=90%)
|
|
@@ -10,13 +10,14 @@
|
|
|
10
10
|
# limitations under the License.
|
|
11
11
|
################################################################################
|
|
12
12
|
|
|
13
|
+
import math
|
|
13
14
|
from abc import ABCMeta, abstractmethod
|
|
14
|
-
from typing import Iterator
|
|
15
|
+
from typing import Iterator
|
|
15
16
|
|
|
16
17
|
from docling_core.types import Document as DLDocument
|
|
18
|
+
from quackling.core.chunkers.hierarchical_chunker import HierarchicalChunker
|
|
17
19
|
from llama_index.core import Document as LIDocument
|
|
18
20
|
from llama_index.core.node_parser import MarkdownNodeParser
|
|
19
|
-
from docling_core.transforms.chunker import HierarchicalChunker
|
|
20
21
|
|
|
21
22
|
|
|
22
23
|
class ChunkingExecutor(metaclass=ABCMeta):
|
|
@@ -24,25 +25,13 @@ class ChunkingExecutor(metaclass=ABCMeta):
|
|
|
24
25
|
def chunk(self, content: str) -> Iterator[dict]:
|
|
25
26
|
raise NotImplemented("The chunk() method must be implemented")
|
|
26
27
|
|
|
27
|
-
|
|
28
28
|
class DLJsonChunker(ChunkingExecutor):
|
|
29
|
-
def __init__(
|
|
30
|
-
self,
|
|
31
|
-
min_chunk_len: Optional[int],
|
|
32
|
-
output_chunk_column_name: str,
|
|
33
|
-
output_jsonpath_column_name: str,
|
|
34
|
-
output_pageno_column_name_key: str,
|
|
35
|
-
output_bbox_column_name_key: str,
|
|
36
|
-
):
|
|
29
|
+
def __init__(self, output_chunk_column_name: str, output_jsonpath_column_name: str, output_pageno_column_name_key: str, output_bbox_column_name_key: str):
|
|
37
30
|
self.output_chunk_column_name = output_chunk_column_name
|
|
38
31
|
self.output_jsonpath_column_name = output_jsonpath_column_name
|
|
39
32
|
self.output_pageno_column_name_key = output_pageno_column_name_key
|
|
40
33
|
self.output_bbox_column_name_key = output_bbox_column_name_key
|
|
41
|
-
|
|
42
|
-
chunker_kwargs = dict(include_metadata=True)
|
|
43
|
-
if min_chunk_len is not None:
|
|
44
|
-
chunker_kwargs["min_chunk_len"] = min_chunk_len
|
|
45
|
-
self._chunker = HierarchicalChunker(**chunker_kwargs)
|
|
34
|
+
self._chunker = HierarchicalChunker(include_metadata=True)
|
|
46
35
|
|
|
47
36
|
def chunk(self, content: str) -> Iterator[dict]:
|
|
48
37
|
doc = DLDocument.model_validate_json(content)
|
|
@@ -54,7 +43,6 @@ class DLJsonChunker(ChunkingExecutor):
|
|
|
54
43
|
self.output_bbox_column_name_key: chunk.bbox,
|
|
55
44
|
}
|
|
56
45
|
|
|
57
|
-
|
|
58
46
|
class LIMarkdown(ChunkingExecutor):
|
|
59
47
|
def __init__(self, output_chunk_column_name: str):
|
|
60
48
|
self.output_chunk_column_name = output_chunk_column_name
|
|
@@ -66,3 +54,4 @@ class LIMarkdown(ChunkingExecutor):
|
|
|
66
54
|
yield {
|
|
67
55
|
self.output_chunk_column_name: node.text,
|
|
68
56
|
}
|
|
57
|
+
|
|
@@ -24,20 +24,14 @@ from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown
|
|
|
24
24
|
short_name = "doc_chunk"
|
|
25
25
|
cli_prefix = f"{short_name}_"
|
|
26
26
|
content_column_name_key = "content_column_name"
|
|
27
|
-
doc_id_column_name_key = "doc_id_column_name"
|
|
28
27
|
chunking_type_key = "chunking_type"
|
|
29
|
-
dl_min_chunk_len_key = "dl_min_chunk_len"
|
|
30
28
|
output_chunk_column_name_key = "output_chunk_column_name"
|
|
31
|
-
output_source_doc_id_column_name_key = "output_source_doc_id_column_name"
|
|
32
29
|
output_jsonpath_column_name_key = "output_jsonpath_column_name"
|
|
33
30
|
output_pageno_column_name_key = "output_pageno_column_name"
|
|
34
31
|
output_bbox_column_name_key = "output_bbox_column_name"
|
|
35
32
|
content_column_name_cli_param = f"{cli_prefix}{content_column_name_key}"
|
|
36
|
-
doc_id_column_name_cli_param = f"{cli_prefix}{doc_id_column_name_key}"
|
|
37
33
|
chunking_type_cli_param = f"{cli_prefix}{chunking_type_key}"
|
|
38
|
-
dl_min_chunk_len_cli_param = f"{cli_prefix}{dl_min_chunk_len_key}"
|
|
39
34
|
output_chunk_column_name_cli_param = f"{cli_prefix}{output_chunk_column_name_key}"
|
|
40
|
-
output_source_doc_id_column_name_cli_param = f"{cli_prefix}{output_source_doc_id_column_name_key}"
|
|
41
35
|
output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}"
|
|
42
36
|
output_pageno_column_name_cli_param = f"{cli_prefix}{output_pageno_column_name_key}"
|
|
43
37
|
output_bbox_column_name_cli_param = f"{cli_prefix}{output_bbox_column_name_key}"
|
|
@@ -52,11 +46,8 @@ class chunking_types(str, enum.Enum):
|
|
|
52
46
|
|
|
53
47
|
|
|
54
48
|
default_content_column_name = "contents"
|
|
55
|
-
default_doc_id_column_name = "document_id"
|
|
56
49
|
default_chunking_type = chunking_types.DL_JSON
|
|
57
|
-
default_dl_min_chunk_len = None
|
|
58
50
|
default_output_chunk_column_name = "contents"
|
|
59
|
-
default_output_source_doc_id_column_name = "source_document_id"
|
|
60
51
|
default_output_jsonpath_column_name = "doc_jsonpath"
|
|
61
52
|
default_output_pageno_column_name = "page_number"
|
|
62
53
|
default_output_bbox_column_name = "bbox"
|
|
@@ -82,12 +73,9 @@ class DocChunkTransform(AbstractTableTransform):
|
|
|
82
73
|
self.chunking_type = config.get(chunking_type_key, default_chunking_type)
|
|
83
74
|
|
|
84
75
|
self.content_column_name = config.get(content_column_name_key, default_content_column_name)
|
|
85
|
-
self.doc_id_column_name = config.get(doc_id_column_name_key, default_doc_id_column_name)
|
|
86
76
|
self.output_chunk_column_name = config.get(output_chunk_column_name_key, default_output_chunk_column_name)
|
|
87
|
-
self.output_source_doc_id_column_name = config.get(output_source_doc_id_column_name_key, default_output_source_doc_id_column_name)
|
|
88
77
|
|
|
89
78
|
# Parameters for Docling JSON chunking
|
|
90
|
-
self.dl_min_chunk_len = config.get(dl_min_chunk_len_key, default_dl_min_chunk_len)
|
|
91
79
|
self.output_jsonpath_column_name = config.get(
|
|
92
80
|
output_jsonpath_column_name_key, default_output_jsonpath_column_name
|
|
93
81
|
)
|
|
@@ -101,7 +89,6 @@ class DocChunkTransform(AbstractTableTransform):
|
|
|
101
89
|
self.chunker: ChunkingExecutor
|
|
102
90
|
if self.chunking_type == chunking_types.DL_JSON:
|
|
103
91
|
self.chunker = DLJsonChunker(
|
|
104
|
-
min_chunk_len=self.dl_min_chunk_len,
|
|
105
92
|
output_chunk_column_name=self.output_chunk_column_name,
|
|
106
93
|
output_jsonpath_column_name=self.output_jsonpath_column_name,
|
|
107
94
|
output_pageno_column_name_key=self.output_pageno_column_name_key,
|
|
@@ -125,11 +112,8 @@ class DocChunkTransform(AbstractTableTransform):
|
|
|
125
112
|
for batch in table.to_batches():
|
|
126
113
|
for row in batch.to_pylist():
|
|
127
114
|
content: str = row[self.content_column_name]
|
|
128
|
-
new_row = {k: v for k, v in row.items() if k not in (self.content_column_name,
|
|
129
|
-
if self.doc_id_column_name in row:
|
|
130
|
-
new_row[self.output_source_doc_id_column_name] = row[self.doc_id_column_name]
|
|
115
|
+
new_row = {k: v for k, v in row.items() if k not in (self.content_column_name,)}
|
|
131
116
|
for chunk in self.chunker.chunk(content):
|
|
132
|
-
chunk[self.doc_id_column_name] = TransformUtils.str_to_hash(chunk[self.output_chunk_column_name])
|
|
133
117
|
data.append(
|
|
134
118
|
{
|
|
135
119
|
**new_row,
|
|
@@ -178,26 +162,11 @@ class DocChunkTransformConfiguration(TransformConfiguration):
|
|
|
178
162
|
default=default_content_column_name,
|
|
179
163
|
help="Name of the column containing the text to be chunked",
|
|
180
164
|
)
|
|
181
|
-
parser.add_argument(
|
|
182
|
-
f"--{doc_id_column_name_cli_param}",
|
|
183
|
-
default=default_doc_id_column_name,
|
|
184
|
-
help="Name of the column containing the doc_id to be propagated in the output",
|
|
185
|
-
)
|
|
186
|
-
parser.add_argument(
|
|
187
|
-
f"--{dl_min_chunk_len_cli_param}",
|
|
188
|
-
default=default_dl_min_chunk_len,
|
|
189
|
-
help="Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a min_chunk_len=64.",
|
|
190
|
-
)
|
|
191
165
|
parser.add_argument(
|
|
192
166
|
f"--{output_chunk_column_name_cli_param}",
|
|
193
167
|
default=default_output_chunk_column_name,
|
|
194
168
|
help="Column name to store the chunks",
|
|
195
169
|
)
|
|
196
|
-
parser.add_argument(
|
|
197
|
-
f"--{output_source_doc_id_column_name_cli_param}",
|
|
198
|
-
default=default_output_source_doc_id_column_name,
|
|
199
|
-
help="Column name to store the `document_id` from the input table",
|
|
200
|
-
)
|
|
201
170
|
parser.add_argument(
|
|
202
171
|
f"--{output_jsonpath_column_name_cli_param}",
|
|
203
172
|
default=default_output_jsonpath_column_name,
|
|
@@ -16,13 +16,12 @@ import sys
|
|
|
16
16
|
from data_processing.runtime.pure_python import PythonTransformLauncher
|
|
17
17
|
from data_processing.utils import ParamsUtils
|
|
18
18
|
from doc_quality_transform import (
|
|
19
|
-
bad_word_filepath_cli_param,
|
|
20
|
-
doc_content_column_cli_param,
|
|
21
19
|
text_lang_cli_param,
|
|
20
|
+
doc_content_column_cli_param,
|
|
21
|
+
bad_word_filepath_cli_param,
|
|
22
22
|
)
|
|
23
23
|
from doc_quality_transform_python import DocQualityPythonTransformConfiguration
|
|
24
24
|
|
|
25
|
-
|
|
26
25
|
# create parameters
|
|
27
26
|
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
|
|
28
27
|
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
|
|
@@ -32,7 +31,7 @@ local_conf = {
|
|
|
32
31
|
}
|
|
33
32
|
code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
|
|
34
33
|
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
|
|
35
|
-
model_path
|
|
34
|
+
model_path=os.path.join(basedir, "models")
|
|
36
35
|
if not os.path.exists(model_path):
|
|
37
36
|
model_path = os.path.abspath(os.path.join(basedir, "..", "models"))
|
|
38
37
|
|