data-prep-toolkit-transforms 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. data_prep_toolkit_transforms-0.2.1/Makefile +62 -0
  2. data_prep_toolkit_transforms-0.2.1/PKG-INFO +73 -0
  3. data_prep_toolkit_transforms-0.2.1/README.md +39 -0
  4. data_prep_toolkit_transforms-0.2.1/pyproject.toml +39 -0
  5. data_prep_toolkit_transforms-0.2.1/requirements.txt +31 -0
  6. data_prep_toolkit_transforms-0.2.1/setup.cfg +4 -0
  7. data_prep_toolkit_transforms-0.2.1/src/cc_net_prepro.py +168 -0
  8. data_prep_toolkit_transforms-0.2.1/src/code2parquet_local.py +51 -0
  9. data_prep_toolkit_transforms-0.2.1/src/code2parquet_local_python.py +60 -0
  10. data_prep_toolkit_transforms-0.2.1/src/code2parquet_s3_python.py +61 -0
  11. data_prep_toolkit_transforms-0.2.1/src/code2parquet_transform.py +222 -0
  12. data_prep_toolkit_transforms-0.2.1/src/code2parquet_transform_python.py +39 -0
  13. data_prep_toolkit_transforms-0.2.1/src/code_quality_local.py +37 -0
  14. data_prep_toolkit_transforms-0.2.1/src/code_quality_local_python.py +50 -0
  15. data_prep_toolkit_transforms-0.2.1/src/code_quality_transform.py +312 -0
  16. data_prep_toolkit_transforms-0.2.1/src/code_quality_transform_python.py +26 -0
  17. data_prep_toolkit_transforms-0.2.1/src/data_prep_toolkit_transforms.egg-info/PKG-INFO +73 -0
  18. data_prep_toolkit_transforms-0.2.1/src/data_prep_toolkit_transforms.egg-info/SOURCES.txt +85 -0
  19. data_prep_toolkit_transforms-0.2.1/src/data_prep_toolkit_transforms.egg-info/dependency_links.txt +1 -0
  20. data_prep_toolkit_transforms-0.2.1/src/data_prep_toolkit_transforms.egg-info/requires.txt +26 -0
  21. data_prep_toolkit_transforms-0.2.1/src/data_prep_toolkit_transforms.egg-info/top_level.txt +76 -0
  22. data_prep_toolkit_transforms-0.2.1/src/doc_Gopher_statistics.py +158 -0
  23. data_prep_toolkit_transforms-0.2.1/src/doc_c4_statistics.py +167 -0
  24. data_prep_toolkit_transforms-0.2.1/src/doc_chunk_chunkers.py +68 -0
  25. data_prep_toolkit_transforms-0.2.1/src/doc_chunk_local.py +34 -0
  26. data_prep_toolkit_transforms-0.2.1/src/doc_chunk_local_python.py +49 -0
  27. data_prep_toolkit_transforms-0.2.1/src/doc_chunk_transform.py +227 -0
  28. data_prep_toolkit_transforms-0.2.1/src/doc_chunk_transform_python.py +43 -0
  29. data_prep_toolkit_transforms-0.2.1/src/doc_id_local.py +54 -0
  30. data_prep_toolkit_transforms-0.2.1/src/doc_id_local_python.py +52 -0
  31. data_prep_toolkit_transforms-0.2.1/src/doc_id_transform_base.py +177 -0
  32. data_prep_toolkit_transforms-0.2.1/src/doc_id_transform_python.py +120 -0
  33. data_prep_toolkit_transforms-0.2.1/src/doc_quality_local.py +43 -0
  34. data_prep_toolkit_transforms-0.2.1/src/doc_quality_local_python.py +57 -0
  35. data_prep_toolkit_transforms-0.2.1/src/doc_quality_transform.py +241 -0
  36. data_prep_toolkit_transforms-0.2.1/src/doc_quality_transform_python.py +42 -0
  37. data_prep_toolkit_transforms-0.2.1/src/doc_quality_utils.py +67 -0
  38. data_prep_toolkit_transforms-0.2.1/src/ededup_local.py +43 -0
  39. data_prep_toolkit_transforms-0.2.1/src/ededup_local_python.py +46 -0
  40. data_prep_toolkit_transforms-0.2.1/src/ededup_local_python_incremental.py +53 -0
  41. data_prep_toolkit_transforms-0.2.1/src/ededup_transform_base.py +249 -0
  42. data_prep_toolkit_transforms-0.2.1/src/ededup_transform_python.py +145 -0
  43. data_prep_toolkit_transforms-0.2.1/src/filter_local.py +58 -0
  44. data_prep_toolkit_transforms-0.2.1/src/filter_local_python.py +60 -0
  45. data_prep_toolkit_transforms-0.2.1/src/filter_test_support.py +135 -0
  46. data_prep_toolkit_transforms-0.2.1/src/filter_transform.py +192 -0
  47. data_prep_toolkit_transforms-0.2.1/src/filter_transform_python.py +31 -0
  48. data_prep_toolkit_transforms-0.2.1/src/flair_recognizer.py +149 -0
  49. data_prep_toolkit_transforms-0.2.1/src/header_cleanser_local.py +52 -0
  50. data_prep_toolkit_transforms-0.2.1/src/header_cleanser_local_python.py +53 -0
  51. data_prep_toolkit_transforms-0.2.1/src/header_cleanser_test_support.py +94 -0
  52. data_prep_toolkit_transforms-0.2.1/src/header_cleanser_transform.py +224 -0
  53. data_prep_toolkit_transforms-0.2.1/src/header_cleanser_transform_python.py +31 -0
  54. data_prep_toolkit_transforms-0.2.1/src/lang_id_local.py +49 -0
  55. data_prep_toolkit_transforms-0.2.1/src/lang_id_local_python.py +55 -0
  56. data_prep_toolkit_transforms-0.2.1/src/lang_id_transform.py +141 -0
  57. data_prep_toolkit_transforms-0.2.1/src/lang_id_transform_python.py +42 -0
  58. data_prep_toolkit_transforms-0.2.1/src/lang_models.py +52 -0
  59. data_prep_toolkit_transforms-0.2.1/src/nlp.py +46 -0
  60. data_prep_toolkit_transforms-0.2.1/src/pdf2parquet_local.py +39 -0
  61. data_prep_toolkit_transforms-0.2.1/src/pdf2parquet_local_python.py +55 -0
  62. data_prep_toolkit_transforms-0.2.1/src/pdf2parquet_transform.py +332 -0
  63. data_prep_toolkit_transforms-0.2.1/src/pdf2parquet_transform_python.py +42 -0
  64. data_prep_toolkit_transforms-0.2.1/src/pii_analyzer.py +71 -0
  65. data_prep_toolkit_transforms-0.2.1/src/pii_anonymizer.py +27 -0
  66. data_prep_toolkit_transforms-0.2.1/src/pii_redactor_local.py +37 -0
  67. data_prep_toolkit_transforms-0.2.1/src/pii_redactor_local_python.py +37 -0
  68. data_prep_toolkit_transforms-0.2.1/src/pii_redactor_transform.py +152 -0
  69. data_prep_toolkit_transforms-0.2.1/src/pii_redactor_transform_python.py +35 -0
  70. data_prep_toolkit_transforms-0.2.1/src/proglang_select_local.py +51 -0
  71. data_prep_toolkit_transforms-0.2.1/src/proglang_select_local_python.py +61 -0
  72. data_prep_toolkit_transforms-0.2.1/src/proglang_select_transform.py +167 -0
  73. data_prep_toolkit_transforms-0.2.1/src/proglang_select_transform_python.py +32 -0
  74. data_prep_toolkit_transforms-0.2.1/src/resize_local.py +36 -0
  75. data_prep_toolkit_transforms-0.2.1/src/resize_local_python.py +46 -0
  76. data_prep_toolkit_transforms-0.2.1/src/resize_transform.py +193 -0
  77. data_prep_toolkit_transforms-0.2.1/src/resize_transform_python.py +40 -0
  78. data_prep_toolkit_transforms-0.2.1/src/text_encoder_local.py +44 -0
  79. data_prep_toolkit_transforms-0.2.1/src/text_encoder_local_python.py +44 -0
  80. data_prep_toolkit_transforms-0.2.1/src/text_encoder_transform.py +127 -0
  81. data_prep_toolkit_transforms-0.2.1/src/text_encoder_transform_python.py +44 -0
  82. data_prep_toolkit_transforms-0.2.1/src/tokenization_local_long_doc_python.py +49 -0
  83. data_prep_toolkit_transforms-0.2.1/src/tokenization_local_python.py +40 -0
  84. data_prep_toolkit_transforms-0.2.1/src/tokenization_s3_long_doc_python.py +52 -0
  85. data_prep_toolkit_transforms-0.2.1/src/tokenization_transform.py +258 -0
  86. data_prep_toolkit_transforms-0.2.1/src/tokenization_transform_python.py +27 -0
  87. data_prep_toolkit_transforms-0.2.1/src/tokenization_utils.py +143 -0
@@ -0,0 +1,62 @@
1
+ # Define the root of the local git clone for the common rules to be able
2
+ # to know where they are running from.
3
+ REPOROOT=../../..
4
+ # Include a library of common .transform.* targets which most
5
+ # transforms should be able to reuse. However, feel free
6
+ # to override/redefine the rules below.
7
+
8
+ # $(REPOROOT)/.make.versions file contains the versions
9
+
10
+ include $(REPOROOT)/transforms/.make.transforms
11
+ include ../.make.packaging
12
+
13
+ PACKAGING_RUN_TIME=python
14
+
15
+
16
+ #Excluded List
17
+ # ./code/malware
18
+ # ./universal/html2parquet
19
+ # ./universal/profiler # Missing implementation
20
+ # ./universal/fdedup # Missing implementation
21
+ # code/repo_level_ordering # Missing implementation
22
+
23
+
24
+ TRANSFORMS_NAMES = code/code_quality \
25
+ code/code2parquet \
26
+ code/header_cleanser \
27
+ code/proglang_select \
28
+ language/doc_chunk \
29
+ language/doc_quality \
30
+ language/lang_id \
31
+ language/pdf2parquet \
32
+ language/pii_redactor \
33
+ language/text_encoder \
34
+ universal/ededup \
35
+ universal/filter \
36
+ universal/resize \
37
+ universal/tokenization \
38
+ universal/doc_id
39
+
40
+
41
+ # distribution versions is the same as image version.
42
+ set-versions:
43
+ $(MAKE) TRANSFORM_PYTHON_VERSION=$(DPK_TRANSFORMS_VERSION) TOML_VERSION=$(DPK_TRANSFORMS_VERSION) .transforms.set-versions
44
+
45
+ test-src::
46
+ $(MAKE) src
47
+ $(MAKE) .transforms.python-venv
48
+ $(MAKE) run-ut
49
+ @# Help: Do any default transform setup before running make src and setting up a test environment
50
+
51
+ test-with-pypi:
52
+ -rm -fr venv
53
+ $(MAKE) .defaults.create-venv
54
+ source venv/bin/activate; \
55
+ # $(PYTHON) -m pip install dist/data_prep_toolkit_transforms-$(DPK_TRANSFORMS_VERSION)-py3-none-any.whl
56
+ # $(PYTHON) -m pip install data_prep_toolkit_transforms==$(DPK_TRANSFORMS_VERSION)
57
+ $(MAKE) run-ut
58
+ @# Help: Load wheel from pypi and run all unit tests: final step in verification after deploying to pypi
59
+
60
+
61
+
62
+
@@ -0,0 +1,73 @@
1
+ Metadata-Version: 2.1
2
+ Name: data_prep_toolkit_transforms
3
+ Version: 0.2.1
4
+ Summary: Data Preparation Toolkit Transforms
5
+ Author-email: Maroun Touma <touma@us.ibm.com>
6
+ License: Apache-2.0
7
+ Keywords: transforms,data preprocessing,data preparation,llm,generative,ai,fine-tuning,llmapps
8
+ Requires-Python: <3.12,>=3.10
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: data-prep-toolkit>=0.2.1
11
+ Requires-Dist: bs4==0.0.2
12
+ Requires-Dist: docling-ibm-models==1.1.7
13
+ Requires-Dist: deepsearch-glm==0.21.0
14
+ Requires-Dist: docling==1.11.0
15
+ Requires-Dist: filetype<2.0.0,>=1.2.0
16
+ Requires-Dist: docling-core==1.3.0
17
+ Requires-Dist: llama-index-core<0.12.0,>=0.11.0
18
+ Requires-Dist: duckdb==0.10.1
19
+ Requires-Dist: fasttext==0.9.2
20
+ Requires-Dist: huggingface-hub<1.0.0,>=0.21.4
21
+ Requires-Dist: langcodes==3.3.0
22
+ Requires-Dist: mmh3==4.1.0
23
+ Requires-Dist: numpy==1.26.4
24
+ Requires-Dist: pandas
25
+ Requires-Dist: parameterized
26
+ Requires-Dist: scancode-toolkit==32.1.0; platform_system != "Darwin"
27
+ Requires-Dist: sentence-transformers==3.0.1
28
+ Requires-Dist: transformers==4.38.2
29
+ Requires-Dist: xxhash==3.4.1
30
+ Requires-Dist: presidio-analyzer>=2.2.355
31
+ Requires-Dist: presidio-anonymizer>=2.2.355
32
+ Requires-Dist: flair>=0.14.0
33
+ Requires-Dist: pandas>=2.2.2
34
+
35
+ # DPK Python Transforms
36
+
37
+ ## installation
38
+
39
+ The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard Python library available on pypi and can be installed using pip install:
40
+
41
+ `python -m pip install data-prep-toolkit-transforms`
42
+
43
+ installing the python transforms will also install `data-prep-toolkit`
44
+
45
+ ## List of Transforms in current package
46
+
47
+ Note: This list includes the transforms that are part of the current release for 0.2.1.dev3 and will be maintained on a best-effort basis but may not always be up to date. Users are encouraged to raise an issue in git when they discover missing components.
48
+
49
+ * code
50
+ * [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/python/README.md)
51
+ * [header_cleanser (Not available on MacOS)](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/header_cleanser/python/README.md)
52
+ * [code_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code_quality/python/README.md)
53
+ * [proglang_select](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/proglang_select/python/README.md)
54
+ * language
55
+ * [doc_chunk](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_chunk/python/README.md)
56
+ * [doc_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_quality/python/README.md)
57
+ * [lang_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/lang_id/python/README.md)
58
+ * [pdf2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/pdf2parquet/python/README.md)
59
+ * [text_encoder](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/text_encoder/python/README.md)
60
+ * [pii_redactor](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/pii_redactor/python/README.md)
61
+ * universal
62
+ * [ededup](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/ededup/python/README.md)
63
+ * [filter](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/filter/python/README.md)
64
+ * [resize](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/resize/python/README.md)
65
+ * [tokenization](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/tokenization/python/README.md)
66
+ * [doc_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_id/python/README.md)
67
+
68
+
69
+
70
+
71
+
72
+
73
+
@@ -0,0 +1,39 @@
1
+ # DPK Python Transforms
2
+
3
+ ## installation
4
+
5
+ The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard Python library available on pypi and can be installed using pip install:
6
+
7
+ `python -m pip install data-prep-toolkit-transforms`
8
+
9
+ installing the python transforms will also install `data-prep-toolkit`
10
+
11
+ ## List of Transforms in current package
12
+
13
+ Note: This list includes the transforms that are part of the current release for 0.2.1.dev3 and will be maintained on a best-effort basis but may not always be up to date. Users are encouraged to raise an issue in git when they discover missing components.
14
+
15
+ * code
16
+ * [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/python/README.md)
17
+ * [header_cleanser (Not available on MacOS)](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/header_cleanser/python/README.md)
18
+ * [code_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code_quality/python/README.md)
19
+ * [proglang_select](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/proglang_select/python/README.md)
20
+ * language
21
+ * [doc_chunk](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_chunk/python/README.md)
22
+ * [doc_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_quality/python/README.md)
23
+ * [lang_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/lang_id/python/README.md)
24
+ * [pdf2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/pdf2parquet/python/README.md)
25
+ * [text_encoder](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/text_encoder/python/README.md)
26
+ * [pii_redactor](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/pii_redactor/python/README.md)
27
+ * universal
28
+ * [ededup](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/ededup/python/README.md)
29
+ * [filter](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/filter/python/README.md)
30
+ * [resize](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/resize/python/README.md)
31
+ * [tokenization](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/tokenization/python/README.md)
32
+ * [doc_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_id/python/README.md)
33
+
34
+
35
+
36
+
37
+
38
+
39
+
@@ -0,0 +1,39 @@
1
+ [project]
2
+ name = "data_prep_toolkit_transforms"
3
+ version = "0.2.1"
4
+ requires-python = ">=3.10,<3.12"
5
+ keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
6
+ description = "Data Preparation Toolkit Transforms"
7
+ license = {text = "Apache-2.0"}
8
+ readme = {file = "README.md", content-type = "text/markdown"}
9
+ authors = [
10
+ { name = "Maroun Touma", email = "touma@us.ibm.com" },
11
+ ]
12
+ dynamic = ["dependencies"]
13
+
14
+ [build-system]
15
+ requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
16
+ build-backend = "setuptools.build_meta"
17
+
18
+ [tool.setuptools.dynamic]
19
+ dependencies = {file = ["requirements.txt"]}
20
+
21
+ # NOTE(review): "[options]" / "[options.packages.find]" are setup.cfg sections and are
+ # ignored by setuptools when placed in pyproject.toml; this packaging configuration
+ # should be migrated to "[tool.setuptools]" / "[tool.setuptools.packages.find]".
+ [options]
22
+ package_dir = ["src"]
23
+
24
+ [options.packages.find]
25
+ where = ["src/"]
26
+
27
+ [tool.pytest.ini_options]
28
+ # Currently we use low coverage since we have to run tests separately (see makefile)
29
+ #addopts = "--cov --cov-report term-missing --cov-fail-under 25"
30
+ markers = ["unit: unit tests", "integration: integration tests"]
31
+
32
+ [tool.coverage.run]
33
+ include = ["src/*"]
34
+
35
+
36
+
37
+
38
+
39
+
@@ -0,0 +1,31 @@
1
+ data-prep-toolkit>=0.2.1
2
+ bs4==0.0.2
3
+ #pdf2parquet
4
+ # conflict with chunking....
5
+ #docling-core==1.2.0,
6
+ docling-ibm-models==1.1.7
7
+ deepsearch-glm==0.21.0
8
+ docling==1.11.0
9
+ filetype >=1.2.0, <2.0.0
10
+ #Doc chunking
11
+ docling-core==1.3.0
12
+ llama-index-core>=0.11.0,<0.12.0
13
+ duckdb==0.10.1
14
+ fasttext==0.9.2
15
+ huggingface-hub >= 0.21.4, <1.0.0
16
+ langcodes==3.3.0
17
+ mmh3==4.1.0
18
+ numpy==1.26.4
19
+ pandas
20
+ parameterized
21
+ scancode-toolkit==32.1.0 ; platform_system != 'Darwin'
22
+ sentence-transformers==3.0.1
23
+ transformers==4.38.2
24
+ xxhash==3.4.1
25
+ # PII-redactor
26
+ presidio-analyzer>=2.2.355
27
+ presidio-anonymizer>=2.2.355
28
+ flair>=0.14.0
29
+ pandas>=2.2.2
30
+
31
+
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,168 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ """
14
+ @ Create on: 2023/04/25
15
+ @ Description:
16
+ To incorporate preprocessing steps from cc_net/
17
+
18
+ @ Reference:
19
+ https://github.com/facebookresearch/cc_net/blob/main/cc_net/text_normalizer.py
20
+ https://github.com/bigscience-workshop/data_tooling/blob/master/kenlm_training/cc_net/text_normalizer.py
21
+ """
22
+
23
+ import re
24
+ import unicodedata
25
+ from typing import Dict
26
+
27
+
28
+ unicode_punct_dict: Dict[str, str] = {
29
+ ",": ",",
30
+ "。": ".",
31
+ "、": ",",
32
+ "„": '"',
33
+ "”": '"',
34
+ "“": '"',
35
+ "«": '"',
36
+ "»": '"',
37
+ "1": '"',
38
+ "」": '"',
39
+ "「": '"',
40
+ "《": '"',
41
+ "》": '"',
42
+ "´": "'",
43
+ "∶": ":",
44
+ ":": ":",
45
+ "?": "?",
46
+ "!": "!",
47
+ "(": "(",
48
+ ")": ")",
49
+ ";": ";",
50
+ "–": "-",
51
+ "—": " - ",
52
+ ".": ". ",
53
+ "~": "~",
54
+ "’": "'",
55
+ "…": "...",
56
+ "━": "-",
57
+ "〈": "<",
58
+ "〉": ">",
59
+ "【": "[",
60
+ "】": "]",
61
+ "%": "%",
62
+ "►": "-",
63
+ }
64
+
65
+ _unicode_punct_re = re.compile(f"[{''.join(unicode_punct_dict.keys())}]")
66
+
67
+ """
68
+ Generate regex pattern obj for later searching using re.search() or re.match()
69
+ (r'[\x00\x01...\x9e\x9f]',re.UNICODE)
70
+ """
71
+ _non_printing_chars_re = re.compile(f"[{''.join(map(chr, list(range(0, 32)) + list(range(127, 160))))}]")
72
+
73
+
74
+ def unicode_normalization(line: str, language="en") -> str:
75
+ """
76
+ For some languages like 'ja' or 'en', its text should be normalized with the right unicode format
77
+ prior to calling tokenization by sentence piece tokenizer
78
+ """
79
+ if language == "ja":
80
+ line = unicodedata.normalize("NFKC", line)
81
+ elif language == "en":
82
+ line = unicodedata.normalize("NFD", line) # normalize line using Unicode Normalization Form D
83
+ else:
84
+ """
85
+ TODO: add relevant unicodedata normalization here for other languages as needed
86
+ """
87
+ return line
88
+
89
+ return line
90
+
91
+
92
+ def _strip_accents(line) -> str:
93
+ """
94
+ This currently is applied for `en` and `ja` language to strip out accents from text.
95
+ The given text 'line' should be normalized with the right unicode format prior to calling this method.
96
+ For example:
97
+ line = "Café élevàtor ôperàtor naïve Noël façade don't"
98
+ -> "Cafe elevator operator naive Noel facade don't" if line was normalized with unicode format NFD
99
+ -> "Cafe elevator operator naïve Noël façade don't" if line was NOT normalized with any unicode format
100
+ """
101
+
102
+ """Keep char whose category is NOT "Mn", i.e,
103
+ "Mn" is category for Mark or Non-Spacing characters like diacritic/accent or non-spacing marks
104
+ Example of some diacritic/non-spacing marks: ^,´,` as they don't occupy space
105
+ """
106
+ output = [c for c in line if unicodedata.category(c) != "Mn"] # decompose line into chars and diacritical marks
107
+ if len(output) == line:
108
+ return line
109
+ return "".join(output)
110
+
111
+
112
def _replace_unicode_punct(line: str) -> str:
    """
    Replace each punctuation char listed in `unicode_punct_dict` with its
    ASCII equivalent; all other characters pass through unchanged.
    """
    replaced = [unicode_punct_dict.get(ch, ch) for ch in line]
    return "".join(replaced)
117
+
118
+
119
def _remove_unicode_punct(line: str) -> str:
    """
    Delete (rather than replace) every punctuation char matched by
    `_unicode_punct_re` — the more aggressive variant of `_replace_unicode_punct`.
    """
    stripped = _unicode_punct_re.sub("", line)
    return stripped
124
+
125
+
126
def _remove_non_printing_char(line: str) -> str:
    """Delete every control character matched by `_non_printing_chars_re`."""
    cleaned = _non_printing_chars_re.sub("", line)
    return cleaned
128
+
129
+
130
def cc_net_normalize(
    line: str,
    strip_accent: bool = True,
    lower_case: bool = True,
    digit_2_zero: bool = True,
    punct_level: int = 1,
    language: str = "en",
) -> str:
    """
    Apply the cc_net text-normalization pipeline to a single line.

    Steps, in order: whitespace strip, language-specific unicode
    normalization, optional lower-casing, optional accent stripping,
    optional digit->0 mapping, punctuation handling, and finally removal
    of non-printing control characters.

    :param line: raw input text
    :param strip_accent: drop combining marks when True
    :param lower_case: lower-case the text when True
    :param digit_2_zero: replace every digit with "0" when True
    :param punct_level: 1 replaces unicode punctuation with ASCII
                        equivalents, 2 removes it entirely, any other
                        value leaves it untouched
    :param language: language code steering unicode normalization
    :return: the normalized line (whitespace-only input returns empty)
    """
    stripped = line.strip()
    if not stripped:
        return stripped

    result = unicode_normalization(line=stripped, language=language)
    if lower_case:
        result = result.lower()
    if strip_accent:
        result = _strip_accents(result)
    if digit_2_zero:
        # eg, "int 10 float 2.01 scientific 1.2e10" -> "int 00 float 0.00 scientific 0.0e00"
        result = re.sub(r"\d", "0", result)

    if punct_level == 1:
        result = _replace_unicode_punct(result)
    elif punct_level == 2:
        result = _remove_unicode_punct(result)

    return _remove_non_printing_char(result)
163
+
164
+
165
if __name__ == "__main__":
    # Tiny smoke test: digits are zeroed, accents stripped, case lowered.
    line = "Int 10 float 2.01 scientific 1.2e10 Café ôperàtor"
    new_line = cc_net_normalize(line)
    print(f"== {line} -> {new_line}")
@@ -0,0 +1,51 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import ast
14
+ import os
15
+
16
+ from code2parquet_transform import ( # domain_key,; snapshot_key,
17
+ CodeToParquetTransform,
18
+ data_factory_key,
19
+ detect_programming_lang_key,
20
+ supported_langs_file_key,
21
+ )
22
+ from data_processing.data_access import DataAccessFactory, DataAccessLocal
23
+
24
+
25
# Test-fixture locations, resolved relative to this file.
_this_dir = os.path.dirname(__file__)
supported_languages_file = os.path.abspath(
    os.path.join(_this_dir, "../../ray/test-data/languages/lang_extensions.json")
)
input_folder = os.path.abspath(os.path.join(_this_dir, "../test-data/input"))

# Transform configuration: detect the programming language of each file via
# the extension-mapping file above, and only pick up .zip archives.
params = {
    supported_langs_file_key: supported_languages_file,
    detect_programming_lang_key: True,
    # snapshot_key: "github",
    # domain_key: "code",
    "data_files_to_use": ast.literal_eval("['.zip']"),
    data_factory_key: DataAccessFactory(),  # expected to create a DataAccessLocal
}
37
+ }
38
+
39
if __name__ == "__main__":
    # Run the transform directly, outside of any runtime/launcher (no Ray).
    transform = CodeToParquetTransform(params)
    # Use the local data access to read the input archive bytes.
    data_access = DataAccessLocal()
    file_to_process = os.path.join(input_folder, "application-java.zip")
    byte_array, _ = data_access.get_file(file_to_process)
    # Convert the archive contents into output file(s) plus metadata.
    files_list, metadata = transform.transform_binary(file_name=file_to_process, byte_array=byte_array)
    print(f"Got {len(files_list)} output files")
    print(f"output metadata : {metadata}")
@@ -0,0 +1,60 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import ast
14
+ import os
15
+ import sys
16
+
17
+ from code2parquet_transform import ( # domain_key,; snapshot_key,
18
+ detect_programming_lang_cli_key,
19
+ supported_langs_file_cli_key,
20
+ )
21
+ from code2parquet_transform_python import CodeToParquetPythonConfiguration
22
+ from data_processing.runtime.pure_python import PythonTransformLauncher
23
+ from data_processing.utils import ParamsUtils
24
+
25
+
26
# create parameters
_this_dir = os.path.dirname(__file__)
supported_languages_file = os.path.abspath(
    os.path.join(_this_dir, "../test-data/languages/lang_extensions.json")
)
input_folder = os.path.abspath(os.path.join(_this_dir, "../test-data/input"))
output_folder = os.path.abspath(os.path.join(_this_dir, "../output"))
local_conf = {
    "input_folder": input_folder,
    "output_folder": output_folder,
}
worker_options = {"num_cpus": 0.8}
code_location = {"github": "github", "commit_hash": "12345", "path": "path"}

# Transform-specific CLI parameters.
ingest_config = {
    supported_langs_file_cli_key: supported_languages_file,
    detect_programming_lang_cli_key: True,
    # snapshot_key: "github",
    # domain_key: "code",
}

params = {
    # Data access. Only required parameters are specified
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    "data_files_to_use": ast.literal_eval("['.zip']"),
    # orchestrator
    "runtime_pipeline_id": "pipeline_id",
    "runtime_job_id": "job_id",
    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
}
54
+
55
if __name__ == "__main__":
    # Simulate a command line by serializing the combined parameter dicts.
    sys.argv = ParamsUtils.dict_to_req(d=(params | ingest_config))
    # Create the pure-python launcher and run the transform end to end.
    launcher = PythonTransformLauncher(CodeToParquetPythonConfiguration())
    launcher.launch()
@@ -0,0 +1,61 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import ast
14
+ import sys
15
+
16
+ from code2parquet_transform import ( # domain_key,; snapshot_key,
17
+ detect_programming_lang_cli_key,
18
+ supported_langs_file_cli_key,
19
+ )
20
+ from code2parquet_transform_python import CodeToParquetPythonConfiguration
21
+ from data_processing.runtime.pure_python import PythonTransformLauncher
22
+ from data_processing.utils import GB, ParamsUtils
23
+
24
+
25
# create parameters
# NOTE(review): example credentials for a local MinIO instance
# (http://localhost:9000) — never reuse these for real object storage.
s3_cred = {
    "access_key": "localminioaccesskey",
    "secret_key": "localminiosecretkey",
    "url": "http://localhost:9000",
}
s3_conf = {
    "input_folder": "test/ingest_2_parquet/input",
    "output_folder": "test/ingest_2_parquet/output",
}
worker_options = {"num_cpus": 0.8, "memory": 2 * GB}
code_location = {"github": "github", "commit_hash": "12345", "path": "path"}

# Transform-specific CLI parameters.
ingest_config = {
    supported_langs_file_cli_key: "test/ingest_2_parquet/languages/lang_extensions.json",
    detect_programming_lang_cli_key: True,
    # snapshot_key: "github",
    # domain_key: "code",
    "code2parquet_s3_cred": ParamsUtils.convert_to_ast(s3_cred),
}

params = {
    # Data access. Only required parameters are specified
    "data_s3_cred": ParamsUtils.convert_to_ast(s3_cred),
    "data_s3_config": ParamsUtils.convert_to_ast(s3_conf),
    "data_files_to_use": ast.literal_eval("['.zip']"),
    # orchestrator
    "runtime_pipeline_id": "pipeline_id",
    "runtime_job_id": "job_id",
    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
}
55
+
56
if __name__ == "__main__":
    # Simulate a command line by serializing the combined parameter dicts.
    sys.argv = ParamsUtils.dict_to_req(d=(params | ingest_config))
    # Create the pure-python launcher and run the transform against S3/MinIO.
    launcher = PythonTransformLauncher(CodeToParquetPythonConfiguration())
    launcher.launch()