auralith-data-pipeline 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. auralith_data_pipeline-0.1.4/.gitignore +78 -0
  2. auralith_data_pipeline-0.1.4/LICENSE +99 -0
  3. auralith_data_pipeline-0.1.4/PKG-INFO +967 -0
  4. auralith_data_pipeline-0.1.4/README.md +866 -0
  5. auralith_data_pipeline-0.1.4/configs/distributed.yaml +71 -0
  6. auralith_data_pipeline-0.1.4/configs/production.yaml +184 -0
  7. auralith_data_pipeline-0.1.4/pyproject.toml +173 -0
  8. auralith_data_pipeline-0.1.4/scripts/bump-version.sh +74 -0
  9. auralith_data_pipeline-0.1.4/scripts/collect.py +98 -0
  10. auralith_data_pipeline-0.1.4/scripts/train_tokenizer.py +271 -0
  11. auralith_data_pipeline-0.1.4/src/auralith_pipeline/__init__.py +34 -0
  12. auralith_data_pipeline-0.1.4/src/auralith_pipeline/_version.py +34 -0
  13. auralith_data_pipeline-0.1.4/src/auralith_pipeline/cli.py +1517 -0
  14. auralith_data_pipeline-0.1.4/src/auralith_pipeline/config/__init__.py +19 -0
  15. auralith_data_pipeline-0.1.4/src/auralith_pipeline/config/pipeline_config.py +351 -0
  16. auralith_data_pipeline-0.1.4/src/auralith_pipeline/distributed/__init__.py +35 -0
  17. auralith_data_pipeline-0.1.4/src/auralith_pipeline/distributed/client.py +73 -0
  18. auralith_data_pipeline-0.1.4/src/auralith_pipeline/distributed/config.py +144 -0
  19. auralith_data_pipeline-0.1.4/src/auralith_pipeline/distributed/coordinator.py +269 -0
  20. auralith_data_pipeline-0.1.4/src/auralith_pipeline/distributed/pipeline.py +353 -0
  21. auralith_data_pipeline-0.1.4/src/auralith_pipeline/distributed/ray_runner.py +154 -0
  22. auralith_data_pipeline-0.1.4/src/auralith_pipeline/distributed/state.py +332 -0
  23. auralith_data_pipeline-0.1.4/src/auralith_pipeline/distributed/strategies.py +93 -0
  24. auralith_data_pipeline-0.1.4/src/auralith_pipeline/distributed/worker.py +509 -0
  25. auralith_data_pipeline-0.1.4/src/auralith_pipeline/extraction/__init__.py +11 -0
  26. auralith_data_pipeline-0.1.4/src/auralith_pipeline/extraction/extractor.py +304 -0
  27. auralith_data_pipeline-0.1.4/src/auralith_pipeline/pipeline.py +685 -0
  28. auralith_data_pipeline-0.1.4/src/auralith_pipeline/preprocessing/__init__.py +36 -0
  29. auralith_data_pipeline-0.1.4/src/auralith_pipeline/preprocessing/compliance.py +232 -0
  30. auralith_data_pipeline-0.1.4/src/auralith_pipeline/preprocessing/deduplication.py +246 -0
  31. auralith_data_pipeline-0.1.4/src/auralith_pipeline/preprocessing/preprocessor.py +273 -0
  32. auralith_data_pipeline-0.1.4/src/auralith_pipeline/preprocessing/quality.py +382 -0
  33. auralith_data_pipeline-0.1.4/src/auralith_pipeline/preprocessing/synthetic.py +237 -0
  34. auralith_data_pipeline-0.1.4/src/auralith_pipeline/security/__init__.py +25 -0
  35. auralith_data_pipeline-0.1.4/src/auralith_pipeline/security/audit.py +212 -0
  36. auralith_data_pipeline-0.1.4/src/auralith_pipeline/security/data_sanitizer.py +273 -0
  37. auralith_data_pipeline-0.1.4/src/auralith_pipeline/security/pii_scrubber.py +372 -0
  38. auralith_data_pipeline-0.1.4/src/auralith_pipeline/security/privacy_config.py +194 -0
  39. auralith_data_pipeline-0.1.4/src/auralith_pipeline/sharding/__init__.py +15 -0
  40. auralith_data_pipeline-0.1.4/src/auralith_pipeline/sharding/shard_writer.py +316 -0
  41. auralith_data_pipeline-0.1.4/src/auralith_pipeline/sources/__init__.py +26 -0
  42. auralith_data_pipeline-0.1.4/src/auralith_pipeline/sources/data_sources.py +364 -0
  43. auralith_data_pipeline-0.1.4/src/auralith_pipeline/sources/video.py +283 -0
  44. auralith_data_pipeline-0.1.4/src/auralith_pipeline/spark/__init__.py +19 -0
  45. auralith_data_pipeline-0.1.4/src/auralith_pipeline/spark/config.py +84 -0
  46. auralith_data_pipeline-0.1.4/src/auralith_pipeline/spark/runner.py +304 -0
  47. auralith_data_pipeline-0.1.4/src/auralith_pipeline/spark/transforms.py +149 -0
  48. auralith_data_pipeline-0.1.4/src/auralith_pipeline/storage/__init__.py +19 -0
  49. auralith_data_pipeline-0.1.4/src/auralith_pipeline/storage/backends.py +350 -0
  50. auralith_data_pipeline-0.1.4/src/auralith_pipeline/tokenization/__init__.py +27 -0
  51. auralith_data_pipeline-0.1.4/src/auralith_pipeline/tokenization/bpe_tokenizer.py +600 -0
  52. auralith_data_pipeline-0.1.4/src/auralith_pipeline/tokenization/multimodal_tokenizer.py +864 -0
  53. auralith_data_pipeline-0.1.4/src/auralith_pipeline/tokenization/tokenizer.py +212 -0
  54. auralith_data_pipeline-0.1.4/src/auralith_pipeline/tokenization/video_tokenizer.py +223 -0
  55. auralith_data_pipeline-0.1.4/src/auralith_pipeline/utils/__init__.py +43 -0
  56. auralith_data_pipeline-0.1.4/src/auralith_pipeline/utils/file_types.py +81 -0
  57. auralith_data_pipeline-0.1.4/src/auralith_pipeline/utils/helpers.py +37 -0
  58. auralith_data_pipeline-0.1.4/src/auralith_pipeline/utils/tracking.py +396 -0
@@ -0,0 +1,78 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ src/auralith_pipeline/_version.py
7
+
8
+ # C extensions
9
+ *.so
10
+
11
+ # Distribution / packaging
12
+ .Python
13
+ build/
14
+ develop-eggs/
15
+ dist/
16
+ downloads/
17
+ eggs/
18
+ .eggs/
19
+ lib/
20
+ lib64/
21
+ parts/
22
+ sdist/
23
+ var/
24
+ wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+
29
+ # PyInstaller
30
+ *.manifest
31
+ *.spec
32
+
33
+ # Installer logs
34
+ pip-log.txt
35
+ pip-delete-this-directory.txt
36
+
37
+ # Unit test / coverage reports
38
+ htmlcov/
39
+ .tox/
40
+ .nox/
41
+ .coverage
42
+ .coverage.*
43
+ .cache
44
+ nosetests.xml
45
+ coverage.xml
46
+ *.cover
47
+ *.py,cover
48
+ .hypothesis/
49
+ .pytest_cache/
50
+
51
+ # Ruff
52
+ .ruff_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Environments
59
+ .env
60
+ .venv
61
+ env/
62
+ venv/
63
+ ENV/
64
+ env.bak/
65
+ venv.bak/
66
+
67
+ # IDE
68
+ .idea/
69
+ .vscode/
70
+ *.swp
71
+ *.swo
72
+ *~
73
+
74
+ # Project specific
75
+ data/
76
+ *.safetensors
77
+ *.parquet
78
+ !configs/*.yaml
@@ -0,0 +1,99 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity.
18
+
19
+ "You" (or "Your") shall mean an individual or Legal Entity
20
+ exercising permissions granted by this License.
21
+
22
+ "Source" form shall mean the preferred form for making modifications,
23
+ including but not limited to software source code, documentation
24
+ source, and configuration files.
25
+
26
+ "Object" form shall mean any form resulting from mechanical
27
+ transformation or translation of a Source form.
28
+
29
+ "Work" shall mean the work of authorship made available under the
30
+ License, as indicated by a copyright notice.
31
+
32
+ "Derivative Works" shall mean any work that is based on the Work.
33
+
34
+ "Contribution" shall mean any work of authorship submitted to the
35
+ Licensor for inclusion in the Work.
36
+
37
+ "Contributor" shall mean Licensor and any Legal Entity on behalf of
38
+ whom a Contribution has been received by Licensor.
39
+
40
+ 2. Grant of Copyright License. Subject to the terms and conditions of
41
+ this License, each Contributor hereby grants to You a perpetual,
42
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
43
+ copyright license to reproduce, prepare Derivative Works of,
44
+ publicly display, publicly perform, sublicense, and distribute the
45
+ Work and such Derivative Works in Source or Object form.
46
+
47
+ 3. Grant of Patent License. Subject to the terms and conditions of
48
+ this License, each Contributor hereby grants to You a perpetual,
49
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
50
+ patent license to make, have made, use, offer to sell, sell,
51
+ import, and otherwise transfer the Work.
52
+
53
+ 4. Redistribution. You may reproduce and distribute copies of the
54
+ Work or Derivative Works thereof in any medium, with or without
55
+ modifications, and in Source or Object form, provided that You
56
+ meet the following conditions:
57
+
58
+ (a) You must give any other recipients of the Work or
59
+ Derivative Works a copy of this License; and
60
+
61
+ (b) You must cause any modified files to carry prominent notices
62
+ stating that You changed the files; and
63
+
64
+ (c) You must retain, in the Source form of any Derivative Works
65
+ that You distribute, all copyright, patent, trademark, and
66
+ attribution notices from the Source form of the Work.
67
+
68
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
69
+ any Contribution intentionally submitted for inclusion in the Work
70
+ by You to the Licensor shall be under the terms and conditions of
71
+ this License, without any additional terms or conditions.
72
+
73
+ 6. Trademarks. This License does not grant permission to use the trade
74
+ names, trademarks, service marks, or product names of the Licensor.
75
+
76
+ 7. Disclaimer of Warranty. THE WORK IS PROVIDED "AS IS", WITHOUT WARRANTY
77
+ OF ANY KIND, EXPRESS OR IMPLIED.
78
+
79
+ 8. Limitation of Liability. IN NO EVENT SHALL ANY CONTRIBUTOR BE LIABLE
80
+ FOR ANY DAMAGES ARISING FROM THE WORK.
81
+
82
+ 9. Accepting Warranty or Additional Liability. You may choose to offer
83
+ warranty or liability obligations for a fee.
84
+
85
+ END OF TERMS AND CONDITIONS
86
+
87
+ Copyright 2024 AuralithAI
88
+
89
+ Licensed under the Apache License, Version 2.0 (the "License");
90
+ you may not use this file except in compliance with the License.
91
+ You may obtain a copy of the License at
92
+
93
+ http://www.apache.org/licenses/LICENSE-2.0
94
+
95
+ Unless required by applicable law or agreed to in writing, software
96
+ distributed under the License is distributed on an "AS IS" BASIS,
97
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
98
+ See the License for the specific language governing permissions and
99
+ limitations under the License.