data-prep-toolkit-transforms 0.2.1__tar.gz → 0.2.1.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. data_prep_toolkit_transforms-0.2.1.dev0/PKG-INFO +33 -0
  2. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/pyproject.toml +28 -4
  3. data_prep_toolkit_transforms-0.2.1.dev0/src/__init__.py +0 -0
  4. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/code2parquet_transform.py +8 -4
  5. data_prep_toolkit_transforms-0.2.1.dev0/src/data_prep_toolkit_transforms.egg-info/PKG-INFO +33 -0
  6. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/data_prep_toolkit_transforms.egg-info/SOURCES.txt +1 -15
  7. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/data_prep_toolkit_transforms.egg-info/requires.txt +11 -11
  8. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/data_prep_toolkit_transforms.egg-info/top_level.txt +1 -12
  9. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_Gopher_statistics.py +1 -3
  10. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_chunk_chunkers.py +6 -17
  11. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_chunk_transform.py +1 -32
  12. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_quality_local_python.py +3 -4
  13. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_quality_transform.py +12 -20
  14. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/ededup_local.py +5 -8
  15. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/ededup_local_python.py +4 -6
  16. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/ededup_transform_base.py +16 -111
  17. data_prep_toolkit_transforms-0.2.1.dev0/src/ededup_transform_python.py +69 -0
  18. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/filter_local_python.py +2 -2
  19. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/filter_transform.py +3 -0
  20. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/header_cleanser_local.py +5 -5
  21. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/header_cleanser_local_python.py +2 -2
  22. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/header_cleanser_test_support.py +4 -4
  23. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/header_cleanser_transform.py +63 -66
  24. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/header_cleanser_transform_python.py +1 -1
  25. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/lang_id_transform.py +4 -2
  26. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/nlp.py +4 -10
  27. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/pdf2parquet_local.py +3 -7
  28. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/pdf2parquet_local_python.py +5 -11
  29. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/pdf2parquet_transform.py +29 -62
  30. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/proglang_select_local_python.py +2 -2
  31. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/proglang_select_transform.py +3 -0
  32. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/resize_transform.py +2 -8
  33. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/text_encoder_local.py +3 -1
  34. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/text_encoder_local_python.py +6 -2
  35. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/tokenization_local_python.py +2 -2
  36. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/tokenization_transform.py +3 -0
  37. data_prep_toolkit_transforms-0.2.1/Makefile +0 -62
  38. data_prep_toolkit_transforms-0.2.1/PKG-INFO +0 -73
  39. data_prep_toolkit_transforms-0.2.1/README.md +0 -39
  40. data_prep_toolkit_transforms-0.2.1/requirements.txt +0 -31
  41. data_prep_toolkit_transforms-0.2.1/src/data_prep_toolkit_transforms.egg-info/PKG-INFO +0 -73
  42. data_prep_toolkit_transforms-0.2.1/src/doc_id_local.py +0 -54
  43. data_prep_toolkit_transforms-0.2.1/src/doc_id_local_python.py +0 -52
  44. data_prep_toolkit_transforms-0.2.1/src/doc_id_transform_base.py +0 -177
  45. data_prep_toolkit_transforms-0.2.1/src/doc_id_transform_python.py +0 -120
  46. data_prep_toolkit_transforms-0.2.1/src/ededup_local_python_incremental.py +0 -53
  47. data_prep_toolkit_transforms-0.2.1/src/ededup_transform_python.py +0 -145
  48. data_prep_toolkit_transforms-0.2.1/src/flair_recognizer.py +0 -149
  49. data_prep_toolkit_transforms-0.2.1/src/pii_analyzer.py +0 -71
  50. data_prep_toolkit_transforms-0.2.1/src/pii_anonymizer.py +0 -27
  51. data_prep_toolkit_transforms-0.2.1/src/pii_redactor_local.py +0 -37
  52. data_prep_toolkit_transforms-0.2.1/src/pii_redactor_local_python.py +0 -37
  53. data_prep_toolkit_transforms-0.2.1/src/pii_redactor_transform.py +0 -152
  54. data_prep_toolkit_transforms-0.2.1/src/pii_redactor_transform_python.py +0 -35
  55. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/setup.cfg +0 -0
  56. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/cc_net_prepro.py +0 -0
  57. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/code2parquet_local.py +0 -0
  58. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/code2parquet_local_python.py +0 -0
  59. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/code2parquet_s3_python.py +0 -0
  60. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/code2parquet_transform_python.py +0 -0
  61. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/code_quality_local.py +0 -0
  62. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/code_quality_local_python.py +0 -0
  63. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/code_quality_transform.py +0 -0
  64. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/code_quality_transform_python.py +0 -0
  65. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/data_prep_toolkit_transforms.egg-info/dependency_links.txt +0 -0
  66. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_c4_statistics.py +0 -0
  67. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_chunk_local.py +0 -0
  68. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_chunk_local_python.py +0 -0
  69. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_chunk_transform_python.py +0 -0
  70. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_quality_local.py +0 -0
  71. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_quality_transform_python.py +0 -0
  72. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/doc_quality_utils.py +0 -0
  73. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/filter_local.py +0 -0
  74. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/filter_test_support.py +0 -0
  75. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/filter_transform_python.py +0 -0
  76. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/lang_id_local.py +0 -0
  77. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/lang_id_local_python.py +0 -0
  78. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/lang_id_transform_python.py +0 -0
  79. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/lang_models.py +0 -0
  80. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/pdf2parquet_transform_python.py +0 -0
  81. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/proglang_select_local.py +0 -0
  82. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/proglang_select_transform_python.py +0 -0
  83. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/resize_local.py +0 -0
  84. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/resize_local_python.py +0 -0
  85. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/resize_transform_python.py +0 -0
  86. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/text_encoder_transform.py +0 -0
  87. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/text_encoder_transform_python.py +0 -0
  88. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/tokenization_local_long_doc_python.py +0 -0
  89. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/tokenization_s3_long_doc_python.py +0 -0
  90. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/tokenization_transform_python.py +0 -0
  91. {data_prep_toolkit_transforms-0.2.1 → data_prep_toolkit_transforms-0.2.1.dev0}/src/tokenization_utils.py +0 -0
@@ -0,0 +1,33 @@
1
+ Metadata-Version: 2.1
2
+ Name: data_prep_toolkit_transforms
3
+ Version: 0.2.1.dev0
4
+ Summary: Data Preparation Toolkit Transforms
5
+ Author-email: Maroun Touma <touma@us.ibm.com>
6
+ License: Apache-2.0
7
+ Keywords: transforms,data preprocessing,data preparation,llm,generative,ai,fine-tuning,llmapps
8
+ Requires-Python: <3.12,>=3.10
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: data-prep-toolkit==0.2.1.dev0
11
+ Requires-Dist: argparse
12
+ Requires-Dist: boto3==1.34.69
13
+ Requires-Dist: bs4==0.0.2
14
+ Requires-Dist: clamd==1.0.2
15
+ Requires-Dist: docling[ocr]==1.1.2
16
+ Requires-Dist: duckdb==0.10.1
17
+ Requires-Dist: fasttext==0.9.2
18
+ Requires-Dist: filetype<2.0.0,>=1.2.0
19
+ Requires-Dist: huggingface-hub<1.0.0,>=0.21.4
20
+ Requires-Dist: langcodes==3.3.0
21
+ Requires-Dist: mmh3==4.1.0
22
+ Requires-Dist: numpy==1.26.4
23
+ Requires-Dist: pandas
24
+ Requires-Dist: parameterized
25
+ Requires-Dist: pyarrow==16.1.0
26
+ Requires-Dist: python-dateutil>=2.8.2
27
+ Requires-Dist: pytz>=2020.1
28
+ Requires-Dist: quackling==0.1.0
29
+ Requires-Dist: scancode-toolkit==32.1.0; platform_system != "Darwin"
30
+ Requires-Dist: sentence-transformers==3.0.1
31
+ Requires-Dist: transformers==4.38.2
32
+ Requires-Dist: tzdata>=2022.7
33
+ Requires-Dist: xxhash==3.4.1
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "data_prep_toolkit_transforms"
3
- version = "0.2.1"
3
+ version = "0.2.1.dev0"
4
4
  requires-python = ">=3.10,<3.12"
5
5
  keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
6
6
  description = "Data Preparation Toolkit Transforms"
@@ -9,14 +9,38 @@ readme = {file = "README.md", content-type = "text/markdown"}
9
9
  authors = [
10
10
  { name = "Maroun Touma", email = "touma@us.ibm.com" },
11
11
  ]
12
- dynamic = ["dependencies"]
12
+
13
+ dependencies = [
14
+ "data-prep-toolkit==0.2.1.dev0",
15
+ "argparse",
16
+ "boto3==1.34.69",
17
+ "bs4==0.0.2",
18
+ "clamd==1.0.2",
19
+ "docling[ocr]==1.1.2",
20
+ "duckdb==0.10.1",
21
+ "fasttext==0.9.2",
22
+ "filetype >=1.2.0, <2.0.0",
23
+ "huggingface-hub >= 0.21.4, <1.0.0",
24
+ "langcodes==3.3.0",
25
+ "mmh3==4.1.0",
26
+ "numpy==1.26.4",
27
+ "pandas",
28
+ "parameterized",
29
+ "pyarrow==16.1.0",
30
+ "python-dateutil>=2.8.2",
31
+ "pytz>=2020.1",
32
+ "quackling==0.1.0",
33
+ "scancode-toolkit==32.1.0 ; platform_system != 'Darwin'",
34
+ "sentence-transformers==3.0.1",
35
+ "transformers==4.38.2",
36
+ "tzdata>=2022.7",
37
+ "xxhash==3.4.1",
38
+ ]
13
39
 
14
40
  [build-system]
15
41
  requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
16
42
  build-backend = "setuptools.build_meta"
17
43
 
18
- [tool.setuptools.dynamic]
19
- dependencies = {file = ["requirements.txt"]}
20
44
 
21
45
  [options]
22
46
  package_dir = ["src"]
@@ -13,16 +13,20 @@
13
13
  import io
14
14
  import json
15
15
  import logging
16
- import os
17
16
  import uuid
18
17
  import zipfile
19
18
  from argparse import ArgumentParser, Namespace
20
19
  from datetime import datetime
21
20
  from typing import Any
21
+ import os
22
22
 
23
23
  import pyarrow as pa
24
24
  from data_processing.data_access import DataAccess, DataAccessFactory
25
- from data_processing.transform import AbstractBinaryTransform, TransformConfiguration
25
+ from data_processing.transform import (
26
+ AbstractBinaryTransform,
27
+ AbstractTransform,
28
+ TransformConfiguration,
29
+ )
26
30
  from data_processing.utils import CLIArgumentProvider, TransformUtils, str2bool
27
31
 
28
32
 
@@ -132,7 +136,7 @@ class CodeToParquetTransform(AbstractBinaryTransform):
132
136
  "hash": TransformUtils.str_to_hash(content_string),
133
137
  "size": len(content_string),
134
138
  "date_acquired": datetime.now().isoformat(),
135
- "repo_name": os.path.splitext(os.path.basename(file_name))[0],
139
+ "repo_name":os.path.splitext(os.path.basename(file_name))[0]
136
140
  } | self.shared_columns
137
141
  if self.detect_programming_lang:
138
142
  lang = self._get_lang_from_ext(ext)
@@ -155,7 +159,7 @@ class CodeToParquetTransformConfiguration(TransformConfiguration):
155
159
  configuration with CLI args and combining of metadata.
156
160
  """
157
161
 
158
- def __init__(self, transform_class: type[AbstractBinaryTransform] = CodeToParquetTransform):
162
+ def __init__(self, transform_class: type[AbstractTransform] = CodeToParquetTransform):
159
163
  super().__init__(
160
164
  name=shortname,
161
165
  transform_class=transform_class,
@@ -0,0 +1,33 @@
1
+ Metadata-Version: 2.1
2
+ Name: data_prep_toolkit_transforms
3
+ Version: 0.2.1.dev0
4
+ Summary: Data Preparation Toolkit Transforms
5
+ Author-email: Maroun Touma <touma@us.ibm.com>
6
+ License: Apache-2.0
7
+ Keywords: transforms,data preprocessing,data preparation,llm,generative,ai,fine-tuning,llmapps
8
+ Requires-Python: <3.12,>=3.10
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: data-prep-toolkit==0.2.1.dev0
11
+ Requires-Dist: argparse
12
+ Requires-Dist: boto3==1.34.69
13
+ Requires-Dist: bs4==0.0.2
14
+ Requires-Dist: clamd==1.0.2
15
+ Requires-Dist: docling[ocr]==1.1.2
16
+ Requires-Dist: duckdb==0.10.1
17
+ Requires-Dist: fasttext==0.9.2
18
+ Requires-Dist: filetype<2.0.0,>=1.2.0
19
+ Requires-Dist: huggingface-hub<1.0.0,>=0.21.4
20
+ Requires-Dist: langcodes==3.3.0
21
+ Requires-Dist: mmh3==4.1.0
22
+ Requires-Dist: numpy==1.26.4
23
+ Requires-Dist: pandas
24
+ Requires-Dist: parameterized
25
+ Requires-Dist: pyarrow==16.1.0
26
+ Requires-Dist: python-dateutil>=2.8.2
27
+ Requires-Dist: pytz>=2020.1
28
+ Requires-Dist: quackling==0.1.0
29
+ Requires-Dist: scancode-toolkit==32.1.0; platform_system != "Darwin"
30
+ Requires-Dist: sentence-transformers==3.0.1
31
+ Requires-Dist: transformers==4.38.2
32
+ Requires-Dist: tzdata>=2022.7
33
+ Requires-Dist: xxhash==3.4.1
@@ -1,7 +1,5 @@
1
- Makefile
2
- README.md
3
1
  pyproject.toml
4
- requirements.txt
2
+ src/__init__.py
5
3
  src/cc_net_prepro.py
6
4
  src/code2parquet_local.py
7
5
  src/code2parquet_local_python.py
@@ -19,10 +17,6 @@ src/doc_chunk_local.py
19
17
  src/doc_chunk_local_python.py
20
18
  src/doc_chunk_transform.py
21
19
  src/doc_chunk_transform_python.py
22
- src/doc_id_local.py
23
- src/doc_id_local_python.py
24
- src/doc_id_transform_base.py
25
- src/doc_id_transform_python.py
26
20
  src/doc_quality_local.py
27
21
  src/doc_quality_local_python.py
28
22
  src/doc_quality_transform.py
@@ -30,7 +24,6 @@ src/doc_quality_transform_python.py
30
24
  src/doc_quality_utils.py
31
25
  src/ededup_local.py
32
26
  src/ededup_local_python.py
33
- src/ededup_local_python_incremental.py
34
27
  src/ededup_transform_base.py
35
28
  src/ededup_transform_python.py
36
29
  src/filter_local.py
@@ -38,7 +31,6 @@ src/filter_local_python.py
38
31
  src/filter_test_support.py
39
32
  src/filter_transform.py
40
33
  src/filter_transform_python.py
41
- src/flair_recognizer.py
42
34
  src/header_cleanser_local.py
43
35
  src/header_cleanser_local_python.py
44
36
  src/header_cleanser_test_support.py
@@ -54,12 +46,6 @@ src/pdf2parquet_local.py
54
46
  src/pdf2parquet_local_python.py
55
47
  src/pdf2parquet_transform.py
56
48
  src/pdf2parquet_transform_python.py
57
- src/pii_analyzer.py
58
- src/pii_anonymizer.py
59
- src/pii_redactor_local.py
60
- src/pii_redactor_local_python.py
61
- src/pii_redactor_transform.py
62
- src/pii_redactor_transform_python.py
63
49
  src/proglang_select_local.py
64
50
  src/proglang_select_local_python.py
65
51
  src/proglang_select_transform.py
@@ -1,26 +1,26 @@
1
- data-prep-toolkit>=0.2.1
1
+ data-prep-toolkit==0.2.1.dev0
2
+ argparse
3
+ boto3==1.34.69
2
4
  bs4==0.0.2
3
- docling-ibm-models==1.1.7
4
- deepsearch-glm==0.21.0
5
- docling==1.11.0
6
- filetype<2.0.0,>=1.2.0
7
- docling-core==1.3.0
8
- llama-index-core<0.12.0,>=0.11.0
5
+ clamd==1.0.2
6
+ docling[ocr]==1.1.2
9
7
  duckdb==0.10.1
10
8
  fasttext==0.9.2
9
+ filetype<2.0.0,>=1.2.0
11
10
  huggingface-hub<1.0.0,>=0.21.4
12
11
  langcodes==3.3.0
13
12
  mmh3==4.1.0
14
13
  numpy==1.26.4
15
14
  pandas
16
15
  parameterized
16
+ pyarrow==16.1.0
17
+ python-dateutil>=2.8.2
18
+ pytz>=2020.1
19
+ quackling==0.1.0
17
20
  sentence-transformers==3.0.1
18
21
  transformers==4.38.2
22
+ tzdata>=2022.7
19
23
  xxhash==3.4.1
20
- presidio-analyzer>=2.2.355
21
- presidio-anonymizer>=2.2.355
22
- flair>=0.14.0
23
- pandas>=2.2.2
24
24
 
25
25
  [:platform_system != "Darwin"]
26
26
  scancode-toolkit==32.1.0
@@ -1,3 +1,4 @@
1
+ __init__
1
2
  cc_net_prepro
2
3
  code2parquet_local
3
4
  code2parquet_local_python
@@ -15,10 +16,6 @@ doc_chunk_local
15
16
  doc_chunk_local_python
16
17
  doc_chunk_transform
17
18
  doc_chunk_transform_python
18
- doc_id_local
19
- doc_id_local_python
20
- doc_id_transform_base
21
- doc_id_transform_python
22
19
  doc_quality_local
23
20
  doc_quality_local_python
24
21
  doc_quality_transform
@@ -26,7 +23,6 @@ doc_quality_transform_python
26
23
  doc_quality_utils
27
24
  ededup_local
28
25
  ededup_local_python
29
- ededup_local_python_incremental
30
26
  ededup_transform_base
31
27
  ededup_transform_python
32
28
  filter_local
@@ -34,7 +30,6 @@ filter_local_python
34
30
  filter_test_support
35
31
  filter_transform
36
32
  filter_transform_python
37
- flair_recognizer
38
33
  header_cleanser_local
39
34
  header_cleanser_local_python
40
35
  header_cleanser_test_support
@@ -50,12 +45,6 @@ pdf2parquet_local
50
45
  pdf2parquet_local_python
51
46
  pdf2parquet_transform
52
47
  pdf2parquet_transform_python
53
- pii_analyzer
54
- pii_anonymizer
55
- pii_redactor_local
56
- pii_redactor_local_python
57
- pii_redactor_transform
58
- pii_redactor_transform_python
59
48
  proglang_select_local
60
49
  proglang_select_local_python
61
50
  proglang_select_transform
@@ -49,9 +49,7 @@ def compute_word_statistics(text: str, symbols: list = ["#", "..."]) -> tuple[in
49
49
  return total_words, mean_word_len, symbol_to_word_ratio
50
50
 
51
51
 
52
- def compute_bullet_point_ellipsis_alphabet_word_ratio(
53
- text: str, bullets: list = ["-", "*"]
54
- ) -> tuple[float, float, float]:
52
+ def compute_bullet_point_ellipsis_alphabet_word_ratio(text: str, bullets: list = ["-", "*"]) -> tuple[float, float, float]:
55
53
  """
56
54
  Given a text document:
57
55
  - Compute the ratio of lines starting with a bullet point (should be <=90%)
@@ -10,13 +10,14 @@
10
10
  # limitations under the License.
11
11
  ################################################################################
12
12
 
13
+ import math
13
14
  from abc import ABCMeta, abstractmethod
14
- from typing import Iterator, Optional
15
+ from typing import Iterator
15
16
 
16
17
  from docling_core.types import Document as DLDocument
18
+ from quackling.core.chunkers.hierarchical_chunker import HierarchicalChunker
17
19
  from llama_index.core import Document as LIDocument
18
20
  from llama_index.core.node_parser import MarkdownNodeParser
19
- from docling_core.transforms.chunker import HierarchicalChunker
20
21
 
21
22
 
22
23
  class ChunkingExecutor(metaclass=ABCMeta):
@@ -24,25 +25,13 @@ class ChunkingExecutor(metaclass=ABCMeta):
24
25
  def chunk(self, content: str) -> Iterator[dict]:
25
26
  raise NotImplemented("The chunk() method must be implemented")
26
27
 
27
-
28
28
  class DLJsonChunker(ChunkingExecutor):
29
- def __init__(
30
- self,
31
- min_chunk_len: Optional[int],
32
- output_chunk_column_name: str,
33
- output_jsonpath_column_name: str,
34
- output_pageno_column_name_key: str,
35
- output_bbox_column_name_key: str,
36
- ):
29
+ def __init__(self, output_chunk_column_name: str, output_jsonpath_column_name: str, output_pageno_column_name_key: str, output_bbox_column_name_key: str):
37
30
  self.output_chunk_column_name = output_chunk_column_name
38
31
  self.output_jsonpath_column_name = output_jsonpath_column_name
39
32
  self.output_pageno_column_name_key = output_pageno_column_name_key
40
33
  self.output_bbox_column_name_key = output_bbox_column_name_key
41
-
42
- chunker_kwargs = dict(include_metadata=True)
43
- if min_chunk_len is not None:
44
- chunker_kwargs["min_chunk_len"] = min_chunk_len
45
- self._chunker = HierarchicalChunker(**chunker_kwargs)
34
+ self._chunker = HierarchicalChunker(include_metadata=True)
46
35
 
47
36
  def chunk(self, content: str) -> Iterator[dict]:
48
37
  doc = DLDocument.model_validate_json(content)
@@ -54,7 +43,6 @@ class DLJsonChunker(ChunkingExecutor):
54
43
  self.output_bbox_column_name_key: chunk.bbox,
55
44
  }
56
45
 
57
-
58
46
  class LIMarkdown(ChunkingExecutor):
59
47
  def __init__(self, output_chunk_column_name: str):
60
48
  self.output_chunk_column_name = output_chunk_column_name
@@ -66,3 +54,4 @@ class LIMarkdown(ChunkingExecutor):
66
54
  yield {
67
55
  self.output_chunk_column_name: node.text,
68
56
  }
57
+
@@ -24,20 +24,14 @@ from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown
24
24
  short_name = "doc_chunk"
25
25
  cli_prefix = f"{short_name}_"
26
26
  content_column_name_key = "content_column_name"
27
- doc_id_column_name_key = "doc_id_column_name"
28
27
  chunking_type_key = "chunking_type"
29
- dl_min_chunk_len_key = "dl_min_chunk_len"
30
28
  output_chunk_column_name_key = "output_chunk_column_name"
31
- output_source_doc_id_column_name_key = "output_source_doc_id_column_name"
32
29
  output_jsonpath_column_name_key = "output_jsonpath_column_name"
33
30
  output_pageno_column_name_key = "output_pageno_column_name"
34
31
  output_bbox_column_name_key = "output_bbox_column_name"
35
32
  content_column_name_cli_param = f"{cli_prefix}{content_column_name_key}"
36
- doc_id_column_name_cli_param = f"{cli_prefix}{doc_id_column_name_key}"
37
33
  chunking_type_cli_param = f"{cli_prefix}{chunking_type_key}"
38
- dl_min_chunk_len_cli_param = f"{cli_prefix}{dl_min_chunk_len_key}"
39
34
  output_chunk_column_name_cli_param = f"{cli_prefix}{output_chunk_column_name_key}"
40
- output_source_doc_id_column_name_cli_param = f"{cli_prefix}{output_source_doc_id_column_name_key}"
41
35
  output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}"
42
36
  output_pageno_column_name_cli_param = f"{cli_prefix}{output_pageno_column_name_key}"
43
37
  output_bbox_column_name_cli_param = f"{cli_prefix}{output_bbox_column_name_key}"
@@ -52,11 +46,8 @@ class chunking_types(str, enum.Enum):
52
46
 
53
47
 
54
48
  default_content_column_name = "contents"
55
- default_doc_id_column_name = "document_id"
56
49
  default_chunking_type = chunking_types.DL_JSON
57
- default_dl_min_chunk_len = None
58
50
  default_output_chunk_column_name = "contents"
59
- default_output_source_doc_id_column_name = "source_document_id"
60
51
  default_output_jsonpath_column_name = "doc_jsonpath"
61
52
  default_output_pageno_column_name = "page_number"
62
53
  default_output_bbox_column_name = "bbox"
@@ -82,12 +73,9 @@ class DocChunkTransform(AbstractTableTransform):
82
73
  self.chunking_type = config.get(chunking_type_key, default_chunking_type)
83
74
 
84
75
  self.content_column_name = config.get(content_column_name_key, default_content_column_name)
85
- self.doc_id_column_name = config.get(doc_id_column_name_key, default_doc_id_column_name)
86
76
  self.output_chunk_column_name = config.get(output_chunk_column_name_key, default_output_chunk_column_name)
87
- self.output_source_doc_id_column_name = config.get(output_source_doc_id_column_name_key, default_output_source_doc_id_column_name)
88
77
 
89
78
  # Parameters for Docling JSON chunking
90
- self.dl_min_chunk_len = config.get(dl_min_chunk_len_key, default_dl_min_chunk_len)
91
79
  self.output_jsonpath_column_name = config.get(
92
80
  output_jsonpath_column_name_key, default_output_jsonpath_column_name
93
81
  )
@@ -101,7 +89,6 @@ class DocChunkTransform(AbstractTableTransform):
101
89
  self.chunker: ChunkingExecutor
102
90
  if self.chunking_type == chunking_types.DL_JSON:
103
91
  self.chunker = DLJsonChunker(
104
- min_chunk_len=self.dl_min_chunk_len,
105
92
  output_chunk_column_name=self.output_chunk_column_name,
106
93
  output_jsonpath_column_name=self.output_jsonpath_column_name,
107
94
  output_pageno_column_name_key=self.output_pageno_column_name_key,
@@ -125,11 +112,8 @@ class DocChunkTransform(AbstractTableTransform):
125
112
  for batch in table.to_batches():
126
113
  for row in batch.to_pylist():
127
114
  content: str = row[self.content_column_name]
128
- new_row = {k: v for k, v in row.items() if k not in (self.content_column_name, self.doc_id_column_name)}
129
- if self.doc_id_column_name in row:
130
- new_row[self.output_source_doc_id_column_name] = row[self.doc_id_column_name]
115
+ new_row = {k: v for k, v in row.items() if k not in (self.content_column_name,)}
131
116
  for chunk in self.chunker.chunk(content):
132
- chunk[self.doc_id_column_name] = TransformUtils.str_to_hash(chunk[self.output_chunk_column_name])
133
117
  data.append(
134
118
  {
135
119
  **new_row,
@@ -178,26 +162,11 @@ class DocChunkTransformConfiguration(TransformConfiguration):
178
162
  default=default_content_column_name,
179
163
  help="Name of the column containing the text to be chunked",
180
164
  )
181
- parser.add_argument(
182
- f"--{doc_id_column_name_cli_param}",
183
- default=default_doc_id_column_name,
184
- help="Name of the column containing the doc_id to be propagated in the output",
185
- )
186
- parser.add_argument(
187
- f"--{dl_min_chunk_len_cli_param}",
188
- default=default_dl_min_chunk_len,
189
- help="Minimum number of characters for the chunk in the dl_json chunker. Setting to None is using the library defaults, i.e. a min_chunk_len=64.",
190
- )
191
165
  parser.add_argument(
192
166
  f"--{output_chunk_column_name_cli_param}",
193
167
  default=default_output_chunk_column_name,
194
168
  help="Column name to store the chunks",
195
169
  )
196
- parser.add_argument(
197
- f"--{output_source_doc_id_column_name_cli_param}",
198
- default=default_output_source_doc_id_column_name,
199
- help="Column name to store the `document_id` from the input table",
200
- )
201
170
  parser.add_argument(
202
171
  f"--{output_jsonpath_column_name_cli_param}",
203
172
  default=default_output_jsonpath_column_name,
@@ -16,13 +16,12 @@ import sys
16
16
  from data_processing.runtime.pure_python import PythonTransformLauncher
17
17
  from data_processing.utils import ParamsUtils
18
18
  from doc_quality_transform import (
19
- bad_word_filepath_cli_param,
20
- doc_content_column_cli_param,
21
19
  text_lang_cli_param,
20
+ doc_content_column_cli_param,
21
+ bad_word_filepath_cli_param,
22
22
  )
23
23
  from doc_quality_transform_python import DocQualityPythonTransformConfiguration
24
24
 
25
-
26
25
  # create parameters
27
26
  input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
28
27
  output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
@@ -32,7 +31,7 @@ local_conf = {
32
31
  }
33
32
  code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
34
33
  basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
35
- model_path = os.path.join(basedir, "models")
34
+ model_path=os.path.join(basedir, "models")
36
35
  if not os.path.exists(model_path):
37
36
  model_path = os.path.abspath(os.path.join(basedir, "..", "models"))
38
37
 
@@ -10,12 +10,12 @@
10
10
  # limitations under the License.
11
11
  ################################################################################
12
12
 
13
- import os
14
13
  from argparse import ArgumentParser, Namespace
15
14
  from typing import Any
16
15
 
16
+ import os
17
17
  import pyarrow as pa
18
- from data_processing.data_access import DataAccess, DataAccessFactory
18
+ from data_processing.data_access import DataAccessFactory, DataAccess
19
19
  from data_processing.transform import AbstractTableTransform, TransformConfiguration
20
20
  from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger
21
21
  from doc_c4_statistics import (
@@ -32,7 +32,6 @@ from doc_Gopher_statistics import (
32
32
  find_first_japanese_alphabet_position,
33
33
  )
34
34
 
35
-
36
35
  logger = get_logger(__name__)
37
36
 
38
37
  short_name = "docq"
@@ -50,7 +49,6 @@ default_doc_content_column = "contents"
50
49
  data_factory_internal_key = f"{cli_prefix}data_factory"
51
50
  files_to_use_internal_key = f"{cli_prefix}files_to_use"
52
51
 
53
-
54
52
  class DocQualityTransform(AbstractTableTransform):
55
53
  """
56
54
  Implements a transform to calculate document quality.
@@ -67,7 +65,7 @@ class DocQualityTransform(AbstractTableTransform):
67
65
  super().__init__(config)
68
66
  self.text_lang = config.get(text_lang_key, default_text_lang)
69
67
  self.doc_content_column = config.get(doc_content_column_key, default_doc_content_column)
70
-
68
+
71
69
  daf = config.get(data_factory_internal_key, None)
72
70
  bad_word_filepath = config.get(bad_word_filepath_key, None)
73
71
  if bad_word_filepath is not None:
@@ -75,14 +73,11 @@ class DocQualityTransform(AbstractTableTransform):
75
73
  logger.info(f"Load badwords found locally from {bad_word_filepath}")
76
74
  self.re_pattern = c4_load_ldnoobw_words(ft_lang=self.text_lang, file_path=bad_word_filepath)
77
75
  else:
78
- if daf is None:
79
- raise RuntimeError(
80
- f"Did not find DataAccessFactory instance under {data_factory_internal_key} key. This is required when bad word file is not in the local file system."
81
- )
76
+ if daf is None:
77
+ raise RuntimeError(f"Did not find DataAccessFactory instance under {data_factory_internal_key} key. This is required when bad word file is not in the local file system.")
82
78
  logger.info(f"Load badwords from remote")
83
79
  data_access = daf.create_data_access()
84
80
  import tempfile
85
-
86
81
  with tempfile.TemporaryDirectory() as temp_dir:
87
82
  # use a temporary directory until model is loaded to memory
88
83
  bad_word_filepath = self._write_locally(data_access, bad_word_filepath, temp_dir)
@@ -92,7 +87,7 @@ class DocQualityTransform(AbstractTableTransform):
92
87
  filename = os.path.basename(path)
93
88
  content, _ = data_access.get_file(path)
94
89
  temp_file_path = os.path.join(temp_dir, filename)
95
- with open(temp_file_path, "wb") as temp_file:
90
+ with open(temp_file_path, 'wb') as temp_file:
96
91
  temp_file.write(content)
97
92
  return temp_file_path
98
93
 
@@ -190,7 +185,6 @@ class DocQualityTransformConfiguration(TransformConfiguration):
190
185
  Provides support for configuring and using the associated Transform class include
191
186
  configuration with CLI args.
192
187
  """
193
-
194
188
  def __init__(self):
195
189
  super().__init__(
196
190
  name=short_name,
@@ -207,7 +201,9 @@ class DocQualityTransformConfiguration(TransformConfiguration):
207
201
  (e.g, noop_, pii_, etc.)
208
202
  """
209
203
  parser.add_argument(
210
- f"--{text_lang_cli_param}", default=default_text_lang, help="language used in the text content"
204
+ f"--{text_lang_cli_param}",
205
+ default=default_text_lang,
206
+ help="language used in the text content"
211
207
  )
212
208
  parser.add_argument(
213
209
  f"--{doc_content_column_cli_param}",
@@ -229,13 +225,9 @@ class DocQualityTransformConfiguration(TransformConfiguration):
229
225
  :return: True, if validate pass or False otherwise
230
226
  """
231
227
  captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
232
- self.params = (
233
- self.params
234
- | captured
235
- | {
236
- data_factory_internal_key: self.daf,
237
- }
238
- )
228
+ self.params = self.params | captured | {
229
+ data_factory_internal_key: self.daf,
230
+ }
239
231
  logger.info(f"doc_quality parameters are : {self.params}")
240
232
  # Validate and populate the transform's DataAccessFactory
241
233
  return self.daf.apply_input_params(args)
@@ -13,10 +13,7 @@
13
13
  import os
14
14
 
15
15
  from data_processing.data_access import DataAccessLocal
16
- from ededup_transform_base import HashFilter
17
- from ededup_transform_python import EdedupTransform
18
- from ededup_transform_base import doc_column_name_key, int_column_name_key
19
-
16
+ from ededup_transform_python import EdedupPythonTransform
20
17
 
21
18
  # create parameters
22
19
  input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input"))
@@ -26,18 +23,18 @@ local_conf = {
26
23
  "output_folder": output_folder,
27
24
  }
28
25
 
29
- ededup_params = {doc_column_name_key: "contents", int_column_name_key: "document_id", "filter": HashFilter({})}
26
+ ededup_params = {"doc_column": "contents"}
30
27
 
31
28
  if __name__ == "__main__":
32
29
  # Here we show how to run outside of ray
33
30
  # Filter transform needs a DataAccess to ready the domain list.
34
31
  data_access = DataAccessLocal(local_conf)
35
32
  # Create and configure the transform.
36
- transform = EdedupTransform(ededup_params)
33
+ transform = EdedupPythonTransform(ededup_params)
37
34
  # Use the local data access to read a parquet table.
38
35
  table, _ = data_access.get_table(os.path.join(input_folder, "sample1.parquet"))
39
- print(f"input table has {table.num_rows} rows and {table.num_columns} columns")
36
+ print(f"input table has {table.num_rows} rows")
40
37
  # Transform the table
41
38
  table_list, metadata = transform.transform(table)
42
- print(f"\noutput table has {table_list[0].num_rows} rows and {table_list[0].num_columns} columns")
39
+ print(f"\noutput table has {table_list[0].num_rows} rows")
43
40
  print(f"output metadata : {metadata}")
@@ -13,14 +13,13 @@
13
13
  import os
14
14
  import sys
15
15
 
16
- from data_processing.runtime.pure_python import PythonTransformLauncher
17
16
  from data_processing.utils import ParamsUtils
18
- from ededup_transform_python import EdedupPythonTransformRuntimeConfiguration
19
- from ededup_transform_base import doc_column_name_cli_param, int_column_name_cli_param
17
+ from data_processing.runtime.pure_python import PythonTransformLauncher
18
+ from ededup_transform_python import EdedupPythonTransformConfiguration
20
19
 
21
20
 
22
21
  # create launcher
23
- launcher = PythonTransformLauncher(EdedupPythonTransformRuntimeConfiguration())
22
+ launcher = PythonTransformLauncher(EdedupPythonTransformConfiguration())
24
23
  # create parameters
25
24
  input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input"))
26
25
  output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output"))
@@ -37,8 +36,7 @@ params = {
37
36
  "runtime_job_id": "job_id",
38
37
  "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
39
38
  # ededup parameters
40
- doc_column_name_cli_param: "contents",
41
- int_column_name_cli_param: "document_id",
39
+ "ededup_doc_column": "contents",
42
40
  }
43
41
  sys.argv = ParamsUtils.dict_to_req(d=params)
44
42