dalla-data-processing 0.0.1__tar.gz → 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. dalla_data_processing-0.0.3/.dockerignore +63 -0
  2. dalla_data_processing-0.0.3/.github/workflows/ci.yml +116 -0
  3. dalla_data_processing-0.0.3/.github/workflows/release.yml +253 -0
  4. dalla_data_processing-0.0.3/.gitignore +69 -0
  5. dalla_data_processing-0.0.3/.pre-commit-config.yaml +16 -0
  6. dalla_data_processing-0.0.3/MANIFEST.in +8 -0
  7. dalla_data_processing-0.0.3/PKG-INFO +147 -0
  8. dalla_data_processing-0.0.3/README.md +102 -0
  9. dalla_data_processing-0.0.3/dalla_data_processing/__init__.py +56 -0
  10. dalla_data_processing-0.0.3/dalla_data_processing/_version.py +34 -0
  11. dalla_data_processing-0.0.3/dalla_data_processing/cli.py +776 -0
  12. dalla_data_processing-0.0.3/dalla_data_processing/core/README.md +65 -0
  13. dalla_data_processing-0.0.3/dalla_data_processing/core/__init__.py +6 -0
  14. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/core/dataset.py +1 -4
  15. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/core/parallel.py +1 -1
  16. dalla_data_processing-0.0.3/dalla_data_processing/deduplication/README.md +82 -0
  17. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/__init__.py +4 -4
  18. dalla_data_processing-0.0.3/dalla_data_processing/deduplication/onion/src_sc/.gitignore +52 -0
  19. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion_wrapper.py +1 -1
  20. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/postprocessing.py +1 -1
  21. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/preprocessing.py +1 -1
  22. dalla_data_processing-0.0.3/dalla_data_processing/packing/README.md +244 -0
  23. dalla_data_processing-0.0.3/dalla_data_processing/packing/__init__.py +10 -0
  24. dalla_data_processing-0.0.3/dalla_data_processing/packing/dataset_packer.py +449 -0
  25. dalla_data_processing-0.0.3/dalla_data_processing/packing/pack_config.example.yaml +11 -0
  26. dalla_data_processing-0.0.3/dalla_data_processing/quality/README.md +66 -0
  27. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/quality/__init__.py +1 -1
  28. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/quality/checker.py +22 -2
  29. dalla_data_processing-0.0.3/dalla_data_processing/readability/README.md +51 -0
  30. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/readability/__init__.py +5 -5
  31. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/readability/ranking.py +1 -1
  32. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/readability/scorer.py +1 -1
  33. dalla_data_processing-0.0.3/dalla_data_processing/stemming/README.md +90 -0
  34. dalla_data_processing-0.0.3/dalla_data_processing/stemming/__init__.py +9 -0
  35. dalla_data_processing-0.0.1/dalla/stemming/__init__.py → dalla_data_processing-0.0.3/dalla_data_processing/stemming/stemmer.py +138 -11
  36. dalla_data_processing-0.0.3/dalla_data_processing/utils/__init__.py +18 -0
  37. dalla_data_processing-0.0.3/dalla_data_processing.egg-info/PKG-INFO +147 -0
  38. dalla_data_processing-0.0.3/dalla_data_processing.egg-info/SOURCES.txt +78 -0
  39. dalla_data_processing-0.0.3/dalla_data_processing.egg-info/entry_points.txt +2 -0
  40. {dalla_data_processing-0.0.1 → dalla_data_processing-0.0.3}/dalla_data_processing.egg-info/requires.txt +17 -5
  41. dalla_data_processing-0.0.3/dalla_data_processing.egg-info/top_level.txt +1 -0
  42. {dalla_data_processing-0.0.1 → dalla_data_processing-0.0.3}/pyproject.toml +39 -19
  43. dalla_data_processing-0.0.3/scripts/build_onion.sh +114 -0
  44. dalla_data_processing-0.0.3/scripts/release.sh +91 -0
  45. dalla_data_processing-0.0.3/uv.lock +2211 -0
  46. dalla_data_processing-0.0.1/MANIFEST.in +0 -8
  47. dalla_data_processing-0.0.1/PKG-INFO +0 -393
  48. dalla_data_processing-0.0.1/README.md +0 -355
  49. dalla_data_processing-0.0.1/dalla/__init__.py +0 -27
  50. dalla_data_processing-0.0.1/dalla/cli.py +0 -453
  51. dalla_data_processing-0.0.1/dalla/core/__init__.py +0 -6
  52. dalla_data_processing-0.0.1/dalla/utils/__init__.py +0 -10
  53. dalla_data_processing-0.0.1/dalla_data_processing.egg-info/PKG-INFO +0 -393
  54. dalla_data_processing-0.0.1/dalla_data_processing.egg-info/SOURCES.txt +0 -58
  55. dalla_data_processing-0.0.1/dalla_data_processing.egg-info/entry_points.txt +0 -2
  56. dalla_data_processing-0.0.1/dalla_data_processing.egg-info/top_level.txt +0 -1
  57. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/bin/.gitignore +0 -0
  58. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/COPYING +0 -0
  59. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/Makefile +0 -0
  60. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/Makefile.config +0 -0
  61. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/README.md +0 -0
  62. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src/Makefile +0 -0
  63. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src/Makefile.g +0 -0
  64. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src/buzhash.c +0 -0
  65. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src/buzhash.h +0 -0
  66. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src/hashdup.c +0 -0
  67. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src/hashgen.c +0 -0
  68. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src/onion +0 -0
  69. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src/onion.c +0 -0
  70. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src/onion_dup.c +0 -0
  71. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src/version.c +0 -0
  72. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src/version.h +0 -0
  73. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src_sc/Makefile +0 -0
  74. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src_sc/Makefile.g +0 -0
  75. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src_sc/buzhash.c +0 -0
  76. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src_sc/buzhash.h +0 -0
  77. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src_sc/hashdup +0 -0
  78. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src_sc/hashdup.c +0 -0
  79. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src_sc/hashgen +0 -0
  80. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src_sc/hashgen.c +0 -0
  81. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src_sc/onion.c +0 -0
  82. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src_sc/onion_dup.c +0 -0
  83. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src_sc/version.c +0 -0
  84. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/deduplication/onion/src_sc/version.h +0 -0
  85. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/stemming/data/words_al.txt +0 -0
  86. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/stemming/data/words_al_t.txt +0 -0
  87. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/stemming/data/words_t.txt +0 -0
  88. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/utils/logger.py +0 -0
  89. {dalla_data_processing-0.0.1/dalla → dalla_data_processing-0.0.3/dalla_data_processing}/utils/tokenize.py +0 -0
  90. {dalla_data_processing-0.0.1 → dalla_data_processing-0.0.3}/dalla_data_processing.egg-info/dependency_links.txt +0 -0
  91. {dalla_data_processing-0.0.1 → dalla_data_processing-0.0.3}/setup.cfg +0 -0
@@ -0,0 +1,63 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ venv/
25
+ env/
26
+ ENV/
27
+
28
+ # IDE
29
+ .vscode/
30
+ .idea/
31
+ *.swp
32
+ *.swo
33
+ *~
34
+
35
+ # Testing
36
+ .pytest_cache/
37
+ .coverage
38
+ htmlcov/
39
+ .tox/
40
+
41
+ # Git
42
+ .git/
43
+ .gitignore
44
+
45
+ # Documentation
46
+ docs/
47
+ *.md
48
+ !README.md
49
+
50
+ # CI/CD
51
+ .github/
52
+
53
+ # Temporary
54
+ tmp/
55
+ temp/
56
+ *.tmp
57
+
58
+ # Data files (users should mount these)
59
+ *.arrow
60
+ *.parquet
61
+ *.csv
62
+ *.json
63
+ data/
@@ -0,0 +1,116 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [ main, develop ]
6
+ pull_request:
7
+ branches: [ main, develop ]
8
+
9
+ jobs:
10
+ test:
11
+ name: Test Python ${{ matrix.python-version }} on ${{ matrix.os }}
12
+ runs-on: ${{ matrix.os }}
13
+ strategy:
14
+ fail-fast: false
15
+ matrix:
16
+ os: [ubuntu-latest, macos-latest]
17
+ python-version: ['3.12']
18
+
19
+ steps:
20
+ - uses: actions/checkout@v4
21
+ with:
22
+ fetch-depth: 0 # Fetch full history for setuptools-scm
23
+
24
+ - name: Set up Python ${{ matrix.python-version }}
25
+ uses: actions/setup-python@v5
26
+ with:
27
+ python-version: ${{ matrix.python-version }}
28
+
29
+ - name: Install UV
30
+ run: |
31
+ curl -LsSf https://astral.sh/uv/install.sh | sh
32
+ echo "$HOME/.cargo/bin" >> $GITHUB_PATH
33
+
34
+ - name: Install dependencies
35
+ run: |
36
+ uv pip install --system -e ".[dev]"
37
+
38
+ - name: Lint with ruff
39
+ run: |
40
+ ruff check dalla_data_processing/ --output-format=github
41
+ ruff format --check dalla_data_processing/
42
+
43
+ - name: Upload coverage
44
+ if: matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest'
45
+ uses: codecov/codecov-action@v3
46
+ with:
47
+ files: ./coverage.xml
48
+ fail_ci_if_error: false
49
+
50
+ build-onion:
51
+ name: Build Onion Binary for ${{ matrix.os }}
52
+ runs-on: ${{ matrix.os }}
53
+ strategy:
54
+ matrix:
55
+ include:
56
+ - os: ubuntu-latest
57
+ artifact_name: onion-linux-x86_64
58
+ - os: macos-latest
59
+ artifact_name: onion-darwin-x86_64
60
+
61
+ steps:
62
+ - uses: actions/checkout@v4
63
+ with:
64
+ fetch-depth: 0 # Fetch full history for setuptools-scm
65
+
66
+ - name: Install build dependencies
67
+ run: |
68
+ if [ "$RUNNER_OS" == "Linux" ]; then
69
+ sudo apt-get update
70
+ sudo apt-get install -y build-essential libsparsehash-dev
71
+ elif [ "$RUNNER_OS" == "macOS" ]; then
72
+ brew install google-sparsehash
73
+ fi
74
+
75
+ - name: Build Onion
76
+ run: |
77
+ chmod +x scripts/build_onion.sh
78
+ ./scripts/build_onion.sh
79
+
80
+ - name: Upload Onion Binary
81
+ uses: actions/upload-artifact@v4
82
+ with:
83
+ name: ${{ matrix.artifact_name }}
84
+ path: dalla_data_processing/deduplication/bin/onion-*
85
+ if-no-files-found: error
86
+
87
+ quality-checks:
88
+ name: Code Quality
89
+ runs-on: ubuntu-latest
90
+
91
+ steps:
92
+ - uses: actions/checkout@v4
93
+ with:
94
+ fetch-depth: 0 # Fetch full history for setuptools-scm
95
+
96
+ - name: Set up Python
97
+ uses: actions/setup-python@v5
98
+ with:
99
+ python-version: '3.12'
100
+
101
+ - name: Install UV
102
+ run: |
103
+ curl -LsSf https://astral.sh/uv/install.sh | sh
104
+ echo "$HOME/.cargo/bin" >> $GITHUB_PATH
105
+
106
+ - name: Install pre-commit
107
+ run: |
108
+ uv pip install --system pre-commit
109
+
110
+ - name: Clean pre-commit cache
111
+ run: |
112
+ pre-commit clean
113
+
114
+ - name: Run pre-commit hooks
115
+ run: |
116
+ pre-commit run --all-files
@@ -0,0 +1,253 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - 'v*.*.*'
7
+
8
+ permissions:
9
+ contents: write
10
+
11
+ jobs:
12
+ build-onion:
13
+ name: Build Onion Binary for ${{ matrix.os }}
14
+ runs-on: ${{ matrix.os }}
15
+ strategy:
16
+ matrix:
17
+ include:
18
+ - os: ubuntu-latest
19
+ artifact_name: onion-linux-x86_64
20
+ - os: macos-latest
21
+ artifact_name: onion-darwin-x86_64
22
+
23
+ steps:
24
+ - uses: actions/checkout@v4
25
+ with:
26
+ fetch-depth: 0 # Fetch full history for setuptools-scm
27
+
28
+ - name: Install build dependencies
29
+ run: |
30
+ if [ "$RUNNER_OS" == "Linux" ]; then
31
+ sudo apt-get update
32
+ sudo apt-get install -y build-essential libsparsehash-dev
33
+ elif [ "$RUNNER_OS" == "macOS" ]; then
34
+ brew install google-sparsehash
35
+ fi
36
+
37
+ - name: Build Onion
38
+ run: |
39
+ chmod +x scripts/build_onion.sh
40
+ ./scripts/build_onion.sh
41
+
42
+ - name: Upload Onion Binary
43
+ uses: actions/upload-artifact@v4
44
+ with:
45
+ name: ${{ matrix.artifact_name }}
46
+ path: dalla_data_processing/deduplication/bin/onion-*
47
+ if-no-files-found: error
48
+
49
+ build-wheels:
50
+ name: Build wheels on ${{ matrix.os }}
51
+ needs: build-onion
52
+ runs-on: ${{ matrix.os }}
53
+ strategy:
54
+ matrix:
55
+ include:
56
+ - os: ubuntu-latest
57
+ onion_artifact: onion-linux-x86_64
58
+ - os: macos-latest
59
+ onion_artifact: onion-darwin-x86_64
60
+ - os: windows-latest
61
+ onion_artifact: none
62
+
63
+ steps:
64
+ - uses: actions/checkout@v4
65
+ with:
66
+ fetch-depth: 0 # Fetch full history for setuptools-scm
67
+
68
+ - name: Create bin directory
69
+ run: mkdir -p dalla_data_processing/deduplication/bin
70
+ shell: bash
71
+
72
+ - name: Download Onion binary
73
+ if: matrix.onion_artifact != 'none'
74
+ uses: actions/download-artifact@v4
75
+ with:
76
+ name: ${{ matrix.onion_artifact }}
77
+ path: dalla_data_processing/deduplication/bin/
78
+
79
+ - name: Set binary permissions
80
+ if: matrix.onion_artifact != 'none'
81
+ run: chmod +x dalla_data_processing/deduplication/bin/onion-*
82
+ shell: bash
83
+
84
+ - name: List binary files
85
+ if: matrix.onion_artifact != 'none'
86
+ run: ls -lah dalla_data_processing/deduplication/bin/
87
+ shell: bash
88
+
89
+ - name: Set up Python
90
+ uses: actions/setup-python@v5
91
+ with:
92
+ python-version: '3.12'
93
+
94
+ - name: Install build dependencies
95
+ run: |
96
+ python -m pip install --upgrade pip
97
+ pip install build twine
98
+
99
+ - name: Build wheel
100
+ run: python -m build
101
+
102
+ - name: List wheel contents
103
+ run: |
104
+ python -m zipfile -l dist/*.whl | head -50
105
+
106
+ - name: Upload wheels
107
+ uses: actions/upload-artifact@v4
108
+ with:
109
+ name: wheels-${{ matrix.os }}
110
+ path: dist/*
111
+
112
+ build-sdist:
113
+ name: Build source distribution
114
+ runs-on: ubuntu-latest
115
+
116
+ steps:
117
+ - uses: actions/checkout@v4
118
+ with:
119
+ fetch-depth: 0 # Fetch full history for setuptools-scm
120
+
121
+ - name: Set up Python
122
+ uses: actions/setup-python@v5
123
+ with:
124
+ python-version: '3.12'
125
+
126
+ - name: Build sdist
127
+ run: |
128
+ pip install build
129
+ python -m build --sdist
130
+
131
+ - name: Upload sdist
132
+ uses: actions/upload-artifact@v4
133
+ with:
134
+ name: sdist
135
+ path: dist/*.tar.gz
136
+
137
+ publish-pypi:
138
+ name: Publish to PyPI
139
+ needs: [build-wheels, build-sdist]
140
+ runs-on: ubuntu-latest
141
+ environment: release
142
+
143
+ steps:
144
+ # Only use Linux wheel to avoid conflicts
145
+ - uses: actions/download-artifact@v4
146
+ with:
147
+ name: wheels-ubuntu-latest
148
+ path: dist/
149
+
150
+ - uses: actions/download-artifact@v4
151
+ with:
152
+ name: sdist
153
+ path: dist/
154
+
155
+ - name: List distribution files
156
+ run: ls -lh dist/
157
+
158
+ - name: Verify wheel integrity
159
+ run: |
160
+ pip install twine
161
+ twine check dist/*
162
+
163
+ - name: Publish to PyPI
164
+ uses: pypa/gh-action-pypi-publish@release/v1
165
+ with:
166
+ password: ${{ secrets.PYPI_API_TOKEN }}
167
+ skip-existing: true
168
+
169
+ create-release:
170
+ name: Create GitHub Release
171
+ needs: [build-wheels, build-sdist]
172
+ runs-on: ubuntu-latest
173
+
174
+ steps:
175
+ - uses: actions/checkout@v4
176
+ with:
177
+ fetch-depth: 0 # Fetch full history for setuptools-scm
178
+
179
+ - uses: actions/download-artifact@v4
180
+ with:
181
+ pattern: wheels-*
182
+ merge-multiple: true
183
+ path: dist/
184
+
185
+ - uses: actions/download-artifact@v4
186
+ with:
187
+ name: sdist
188
+ path: dist/
189
+
190
+ - name: Extract version from tag
191
+ id: version
192
+ run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT
193
+
194
+ - name: Generate release notes
195
+ id: notes
196
+ run: |
197
+ # Extract changelog for this version
198
+ VERSION=${{ steps.version.outputs.VERSION }}
199
+ echo "NOTES<<EOF" >> $GITHUB_OUTPUT
200
+ echo "## Dalla Data Processing v${VERSION}" >> $GITHUB_OUTPUT
201
+ echo "" >> $GITHUB_OUTPUT
202
+ echo "### Installation" >> $GITHUB_OUTPUT
203
+ echo "\`\`\`bash" >> $GITHUB_OUTPUT
204
+ echo "pip install dalla-data-processing==${VERSION}" >> $GITHUB_OUTPUT
205
+ echo "\`\`\`" >> $GITHUB_OUTPUT
206
+ echo "" >> $GITHUB_OUTPUT
207
+ echo "### What's Changed" >> $GITHUB_OUTPUT
208
+ # TODO: Add actual changelog extraction
209
+ echo "See CHANGELOG.md for details" >> $GITHUB_OUTPUT
210
+ echo "EOF" >> $GITHUB_OUTPUT
211
+
212
+ - name: Create Release
213
+ uses: softprops/action-gh-release@v1
214
+ with:
215
+ files: dist/*
216
+ body: ${{ steps.notes.outputs.NOTES }}
217
+ draft: false
218
+ prerelease: false
219
+ env:
220
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
221
+
222
+ # Commented out for now - uncomment when ready to publish Docker images
223
+ # build-docker:
224
+ # name: Build and Push Docker Image
225
+ # needs: [publish-pypi]
226
+ # runs-on: ubuntu-latest
227
+ #
228
+ # steps:
229
+ # - uses: actions/checkout@v4
230
+ #
231
+ # - name: Set up Docker Buildx
232
+ # uses: docker/setup-buildx-action@v3
233
+ #
234
+ # - name: Log in to Docker Hub
235
+ # uses: docker/login-action@v3
236
+ # with:
237
+ # username: ${{ secrets.DOCKER_USERNAME }}
238
+ # password: ${{ secrets.DOCKER_PASSWORD }}
239
+ #
240
+ # - name: Extract version
241
+ # id: version
242
+ # run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT
243
+ #
244
+ # - name: Build and push
245
+ # uses: docker/build-push-action@v5
246
+ # with:
247
+ # context: .
248
+ # push: true
249
+ # tags: |
250
+ # ${{ secrets.DOCKER_USERNAME }}/dalla-data-processing:latest
251
+ # ${{ secrets.DOCKER_USERNAME }}/dalla-data-processing:${{ steps.version.outputs.VERSION }}
252
+ # cache-from: type=gha
253
+ # cache-to: type=gha,mode=max
@@ -0,0 +1,69 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # setuptools-scm version file
24
+ dalla_data_processing/_version.py
25
+
26
+ # Virtual environments
27
+ venv/
28
+ env/
29
+ ENV/
30
+ .venv
31
+
32
+ # IDE
33
+ .vscode/
34
+ .idea/
35
+ *.swp
36
+ *.swo
37
+ *~
38
+
39
+ # Testing
40
+ .pytest_cache/
41
+ .coverage
42
+ htmlcov/
43
+ .tox/
44
+
45
+ # Type checking
46
+ .mypy_cache/
47
+ .pytype/
48
+ .pyre/
49
+
50
+ # Distribution
51
+ *.whl
52
+
53
+ # Native binaries
54
+ *.o
55
+ *.a
56
+ *.dylib
57
+ *.dll
58
+
59
+ # Data
60
+ *.arrow
61
+ *.parquet
62
+
63
+ # Logs
64
+ *.log
65
+ logs/
66
+
67
+ # OS
68
+ .DS_Store
69
+ Thumbs.db
@@ -0,0 +1,16 @@
1
+ # Pre-commit hooks for dalla-data-processing
2
+ # Install: pre-commit install
3
+ # Run manually: pre-commit run --all-files
4
+
5
+ repos:
6
+ # Ruff - Fast Python linter and formatter
7
+ - repo: https://github.com/astral-sh/ruff-pre-commit
8
+ rev: v0.1.15
9
+ hooks:
10
+ - id: ruff
11
+ args: [--fix, --exit-non-zero-on-fix]
12
+ - id: ruff-format
13
+
14
+ # Configuration
15
+ default_language_version:
16
+ python: python3.12
@@ -0,0 +1,8 @@
1
+ include README.md
2
+ include LICENSE
3
+ recursive-include dalla_data_processing *.py
4
+ recursive-include dalla_data_processing/stemming/data *.txt
5
+ recursive-include dalla_data_processing/deduplication/bin *
6
+ recursive-include dalla_data_processing/deduplication/onion *.c *.h Makefile*
7
+ global-exclude __pycache__
8
+ global-exclude *.py[co]
@@ -0,0 +1,147 @@
1
+ Metadata-Version: 2.4
2
+ Name: dalla-data-processing
3
+ Version: 0.0.3
4
+ Summary: data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models
5
+ Author-email: Hadi Hamoud <hhamoud@dohainstitute.edu.qa>, Digital Research Unit - Arab Center <dru@dohainstitute.edu.qa>
6
+ Project-URL: Homepage, https://github.com/U4RASD/dalla-data-processing
7
+ Project-URL: Documentation, https://github.com/U4RASD/dalla-data-processing#readme
8
+ Project-URL: Repository, https://github.com/U4RASD/dalla-data-processing
9
+ Project-URL: Bug Tracker, https://github.com/U4RASD/dalla-data-processing/issues
10
+ Keywords: arabic,nlp,data-processing,deduplication,stemming,readability,quality
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Classifier: Topic :: Text Processing :: Linguistic
17
+ Requires-Python: <3.13,>=3.12
18
+ Description-Content-Type: text/markdown
19
+ Requires-Dist: datasets>=2.14.0
20
+ Requires-Dist: transformers>=4.30.0
21
+ Requires-Dist: click>=8.0.0
22
+ Requires-Dist: tqdm>=4.65.0
23
+ Requires-Dist: pyarrow>=12.0.0
24
+ Requires-Dist: structlog>=24.0.0
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
27
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
28
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
29
+ Requires-Dist: pre-commit>=3.0.0; extra == "dev"
30
+ Provides-Extra: dedup
31
+ Requires-Dist: camel-tools==1.5.7; extra == "dedup"
32
+ Provides-Extra: dedup-native
33
+ Requires-Dist: cffi>=1.15.0; extra == "dedup-native"
34
+ Provides-Extra: stem
35
+ Requires-Dist: camel-tools==1.5.7; extra == "stem"
36
+ Provides-Extra: quality
37
+ Requires-Dist: camel-tools==1.5.7; extra == "quality"
38
+ Provides-Extra: readability
39
+ Requires-Dist: textstat>=0.7.0; extra == "readability"
40
+ Provides-Extra: pack
41
+ Requires-Dist: sentencepiece>=0.2.0; extra == "pack"
42
+ Requires-Dist: pyyaml; extra == "pack"
43
+ Provides-Extra: all
44
+ Requires-Dist: dalla-data-processing[dedup,dedup-native,dev,pack,quality,readability,stem]; extra == "all"
45
+
46
+ # Dalla Data Processing (dalla-dp)
47
+
48
+ A comprehensive Arabic data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models.
49
+
50
+ ## Compatibility
51
+
52
+ - **Linux**: Fully supported
53
+ - **macOS**: Fully supported (Intel, or Apple Silicon through Rosetta)
54
+ - **Windows**: Supported only through WSL (Windows Subsystem for Linux). On native Windows, a manual build from source works for deduplication.
55
+
56
+ ## Installation
57
+
58
+ ### Quick Start (All Features)
59
+
60
+ For most users, install with all features enabled:
61
+
62
+ <b>Using uv</b>
63
+
64
+ ```bash
65
+ uv pip install "dalla-data-processing[all]"
66
+ ```
67
+
68
+ <b>Using pip</b>
69
+
70
+ ```bash
71
+ pip install "dalla-data-processing[all]"
72
+ ```
73
+
74
+ ### Modular Installation (Advanced)
75
+
76
+ Install only the components you need to keep dependencies minimal:
77
+
78
+ ```bash
79
+ # Base installation (no processing features, only core dependencies)
80
+ pip install dalla-data-processing
81
+
82
+ # Install specific features
83
+ pip install "dalla-data-processing[dedup]" # Deduplication only
84
+ pip install "dalla-data-processing[stem]" # Stemming only
85
+ pip install "dalla-data-processing[quality]" # Quality checking only
86
+ pip install "dalla-data-processing[readability]" # Readability scoring only
87
+ pip install "dalla-data-processing[pack]" # Dataset packing only
88
+
89
+ # Combine multiple features
90
+ pip install "dalla-data-processing[dedup,stem,quality]"
91
+ ```
92
+
93
+ ### Development Installation
94
+
95
+ <b>From Source (with uv - recommended)</b>
96
+
97
+ ```bash
98
+ git clone https://github.com/U4RASD/dalla-data-processing.git
99
+ cd dalla-data-processing
100
+
101
+ # Install all features and dev dependencies
102
+ uv sync --all-extras
103
+
104
+ # Or install with specific extras only
105
+ uv sync --extra dedup --extra stem
106
+ ```
107
+
108
+ <b>From Source (with pip)</b>
109
+
110
+ ```bash
111
+ git clone https://github.com/U4RASD/dalla-data-processing.git
112
+ cd dalla-data-processing
113
+
114
+ # Install with all features for development
115
+ pip install -e ".[all,dev]"
116
+ ```
117
+
118
+ ## Components
119
+
120
+ > **Note:** Each component requires its corresponding extra to be installed. Install with `[all]` to enable all features, or see [Modular Installation](#modular-installation-advanced) to install only what you need.
121
+
122
+ ### 1. [Deduplication](dalla_data_processing/deduplication/README.md)
123
+ Detect and remove duplicate or near-duplicate documents from your datasets using the Onion algorithm.
124
+ - **Requires:** `[dedup]` extra
125
+
126
+ ### 2. [Stemming](dalla_data_processing/stemming/README.md)
127
+ Apply morphological analysis and stemming using CAMeL Tools.
128
+ - **Requires:** `[stem]` extra
129
+
130
+ ### 3. [Quality Checking](dalla_data_processing/quality/README.md)
131
+ Check text quality using morphological analysis to detect errors and foreign words.
132
+ - **Requires:** `[quality]` extra
133
+
134
+ ### 4. [Readability Scoring](dalla_data_processing/readability/README.md)
135
+ Calculate readability scores using Flesch Reading Ease and Osman methods.
136
+ Also includes ranking according to both scores.
137
+ - **Requires:** `[readability]` extra
138
+
139
+ ### 5. [Dataset Packing](dalla_data_processing/packing/README.md)
140
+ Pack and prepare datasets for training.
141
+ - **Requires:** `[pack]` extra
142
+
143
+ ## Links
144
+
145
+ - Homepage: https://github.com/U4RASD/dalla-data-processing
146
+ - Issues: https://github.com/U4RASD/dalla-data-processing/issues
147
+ - Documentation: https://github.com/U4RASD/dalla-data-processing#readme