dalla-data-processing 0.0.3__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/.github/workflows/ci.yml +3 -1
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/.github/workflows/release.yml +33 -32
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/PKG-INFO +5 -2
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/_version.py +3 -3
- {dalla_data_processing-0.0.3/dalla_data_processing/deduplication/onion/src_sc → dalla_data_processing-0.0.4/dalla_data_processing/deduplication/onion/src}/Makefile +1 -1
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/Makefile.g +1 -1
- {dalla_data_processing-0.0.3/dalla_data_processing/deduplication/onion/src → dalla_data_processing-0.0.4/dalla_data_processing/deduplication/onion/src_sc}/Makefile +1 -1
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/Makefile.g +1 -1
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/PKG-INFO +5 -2
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/SOURCES.txt +1 -0
- dalla_data_processing-0.0.4/dalla_data_processing.egg-info/not-zip-safe +1 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/pyproject.toml +7 -2
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/scripts/build_onion.sh +4 -2
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/.dockerignore +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/.gitignore +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/.pre-commit-config.yaml +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/MANIFEST.in +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/README.md +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/__init__.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/cli.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/core/README.md +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/core/__init__.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/core/dataset.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/core/parallel.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/README.md +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/__init__.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/bin/.gitignore +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/COPYING +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/Makefile +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/Makefile.config +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/README.md +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/buzhash.c +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/buzhash.h +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/hashdup.c +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/hashgen.c +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/onion +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/onion.c +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/onion_dup.c +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/version.c +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/version.h +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/.gitignore +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/buzhash.c +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/buzhash.h +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/hashdup +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/hashdup.c +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/hashgen +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/hashgen.c +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/onion.c +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/onion_dup.c +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/version.c +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/version.h +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion_wrapper.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/postprocessing.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/preprocessing.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/packing/README.md +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/packing/__init__.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/packing/dataset_packer.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/packing/pack_config.example.yaml +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/quality/README.md +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/quality/__init__.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/quality/checker.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/readability/README.md +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/readability/__init__.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/readability/ranking.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/readability/scorer.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/README.md +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/__init__.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/data/words_al.txt +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/data/words_al_t.txt +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/data/words_t.txt +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/stemmer.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/utils/__init__.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/utils/logger.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/utils/tokenize.py +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/dependency_links.txt +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/entry_points.txt +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/requires.txt +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/top_level.txt +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/scripts/release.sh +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/setup.cfg +0 -0
- {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/uv.lock +0 -0
|
@@ -55,8 +55,10 @@ jobs:
|
|
|
55
55
|
include:
|
|
56
56
|
- os: ubuntu-latest
|
|
57
57
|
artifact_name: onion-linux-x86_64
|
|
58
|
-
- os: macos-
|
|
58
|
+
- os: macos-13
|
|
59
59
|
artifact_name: onion-darwin-x86_64
|
|
60
|
+
- os: macos-14
|
|
61
|
+
artifact_name: onion-darwin-arm64
|
|
60
62
|
|
|
61
63
|
steps:
|
|
62
64
|
- uses: actions/checkout@v4
|
|
@@ -17,8 +17,10 @@ jobs:
|
|
|
17
17
|
include:
|
|
18
18
|
- os: ubuntu-latest
|
|
19
19
|
artifact_name: onion-linux-x86_64
|
|
20
|
-
- os: macos-
|
|
20
|
+
- os: macos-13
|
|
21
21
|
artifact_name: onion-darwin-x86_64
|
|
22
|
+
- os: macos-14
|
|
23
|
+
artifact_name: onion-darwin-arm64
|
|
22
24
|
|
|
23
25
|
steps:
|
|
24
26
|
- uses: actions/checkout@v4
|
|
@@ -46,19 +48,10 @@ jobs:
|
|
|
46
48
|
path: dalla_data_processing/deduplication/bin/onion-*
|
|
47
49
|
if-no-files-found: error
|
|
48
50
|
|
|
49
|
-
build-
|
|
50
|
-
name: Build
|
|
51
|
+
build-wheel:
|
|
52
|
+
name: Build universal wheel with all platform binaries
|
|
51
53
|
needs: build-onion
|
|
52
|
-
runs-on:
|
|
53
|
-
strategy:
|
|
54
|
-
matrix:
|
|
55
|
-
include:
|
|
56
|
-
- os: ubuntu-latest
|
|
57
|
-
onion_artifact: onion-linux-x86_64
|
|
58
|
-
- os: macos-latest
|
|
59
|
-
onion_artifact: onion-darwin-x86_64
|
|
60
|
-
- os: windows-latest
|
|
61
|
-
onion_artifact: none
|
|
54
|
+
runs-on: ubuntu-latest
|
|
62
55
|
|
|
63
56
|
steps:
|
|
64
57
|
- uses: actions/checkout@v4
|
|
@@ -67,24 +60,30 @@ jobs:
|
|
|
67
60
|
|
|
68
61
|
- name: Create bin directory
|
|
69
62
|
run: mkdir -p dalla_data_processing/deduplication/bin
|
|
70
|
-
shell: bash
|
|
71
63
|
|
|
72
|
-
- name: Download
|
|
73
|
-
|
|
64
|
+
- name: Download Linux binary
|
|
65
|
+
uses: actions/download-artifact@v4
|
|
66
|
+
with:
|
|
67
|
+
name: onion-linux-x86_64
|
|
68
|
+
path: dalla_data_processing/deduplication/bin/
|
|
69
|
+
|
|
70
|
+
- name: Download macOS Intel binary
|
|
71
|
+
uses: actions/download-artifact@v4
|
|
72
|
+
with:
|
|
73
|
+
name: onion-darwin-x86_64
|
|
74
|
+
path: dalla_data_processing/deduplication/bin/
|
|
75
|
+
|
|
76
|
+
- name: Download macOS ARM binary
|
|
74
77
|
uses: actions/download-artifact@v4
|
|
75
78
|
with:
|
|
76
|
-
name:
|
|
79
|
+
name: onion-darwin-arm64
|
|
77
80
|
path: dalla_data_processing/deduplication/bin/
|
|
78
81
|
|
|
79
82
|
- name: Set binary permissions
|
|
80
|
-
if: matrix.onion_artifact != 'none'
|
|
81
83
|
run: chmod +x dalla_data_processing/deduplication/bin/onion-*
|
|
82
|
-
shell: bash
|
|
83
84
|
|
|
84
|
-
- name: List
|
|
85
|
-
if: matrix.onion_artifact != 'none'
|
|
85
|
+
- name: List all binaries
|
|
86
86
|
run: ls -lah dalla_data_processing/deduplication/bin/
|
|
87
|
-
shell: bash
|
|
88
87
|
|
|
89
88
|
- name: Set up Python
|
|
90
89
|
uses: actions/setup-python@v5
|
|
@@ -101,13 +100,16 @@ jobs:
|
|
|
101
100
|
|
|
102
101
|
- name: List wheel contents
|
|
103
102
|
run: |
|
|
104
|
-
|
|
103
|
+
echo "=== Wheel contents (binaries) ==="
|
|
104
|
+
python -m zipfile -l dist/*.whl | grep onion-
|
|
105
|
+
echo "=== Wheel info ==="
|
|
106
|
+
ls -lh dist/
|
|
105
107
|
|
|
106
|
-
- name: Upload
|
|
108
|
+
- name: Upload wheel
|
|
107
109
|
uses: actions/upload-artifact@v4
|
|
108
110
|
with:
|
|
109
|
-
name:
|
|
110
|
-
path: dist
|
|
111
|
+
name: wheel
|
|
112
|
+
path: dist/*.whl
|
|
111
113
|
|
|
112
114
|
build-sdist:
|
|
113
115
|
name: Build source distribution
|
|
@@ -136,15 +138,15 @@ jobs:
|
|
|
136
138
|
|
|
137
139
|
publish-pypi:
|
|
138
140
|
name: Publish to PyPI
|
|
139
|
-
needs: [build-
|
|
141
|
+
needs: [build-wheel, build-sdist]
|
|
140
142
|
runs-on: ubuntu-latest
|
|
141
143
|
environment: release
|
|
142
144
|
|
|
143
145
|
steps:
|
|
144
|
-
#
|
|
146
|
+
# Download the universal wheel with all platform binaries
|
|
145
147
|
- uses: actions/download-artifact@v4
|
|
146
148
|
with:
|
|
147
|
-
name:
|
|
149
|
+
name: wheel
|
|
148
150
|
path: dist/
|
|
149
151
|
|
|
150
152
|
- uses: actions/download-artifact@v4
|
|
@@ -168,7 +170,7 @@ jobs:
|
|
|
168
170
|
|
|
169
171
|
create-release:
|
|
170
172
|
name: Create GitHub Release
|
|
171
|
-
needs: [build-
|
|
173
|
+
needs: [build-wheel, build-sdist]
|
|
172
174
|
runs-on: ubuntu-latest
|
|
173
175
|
|
|
174
176
|
steps:
|
|
@@ -178,8 +180,7 @@ jobs:
|
|
|
178
180
|
|
|
179
181
|
- uses: actions/download-artifact@v4
|
|
180
182
|
with:
|
|
181
|
-
|
|
182
|
-
merge-multiple: true
|
|
183
|
+
name: wheel
|
|
183
184
|
path: dist/
|
|
184
185
|
|
|
185
186
|
- uses: actions/download-artifact@v4
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dalla-data-processing
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.4
|
|
4
4
|
Summary: data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models
|
|
5
5
|
Author-email: Hadi Hamoud <hhamoud@dohainstitute.edu.qa>, Digital Research Unit - Arab Center <dru@dohainstitute.edu.qa>
|
|
6
6
|
Project-URL: Homepage, https://github.com/U4RASD/dalla-data-processing
|
|
@@ -11,10 +11,13 @@ Keywords: arabic,nlp,data-processing,deduplication,stemming,readability,quality
|
|
|
11
11
|
Classifier: Intended Audience :: Developers
|
|
12
12
|
Classifier: Intended Audience :: Science/Research
|
|
13
13
|
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
16
|
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
18
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
19
|
Classifier: Topic :: Text Processing :: Linguistic
|
|
17
|
-
Requires-Python:
|
|
20
|
+
Requires-Python: >=3.10
|
|
18
21
|
Description-Content-Type: text/markdown
|
|
19
22
|
Requires-Dist: datasets>=2.14.0
|
|
20
23
|
Requires-Dist: transformers>=4.30.0
|
{dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/_version.py
RENAMED
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.0.
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 0,
|
|
31
|
+
__version__ = version = '0.0.4'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 0, 4)
|
|
33
33
|
|
|
34
|
-
__commit_id__ = commit_id = '
|
|
34
|
+
__commit_id__ = commit_id = 'g89939c5cc'
|
{dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dalla-data-processing
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.4
|
|
4
4
|
Summary: data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models
|
|
5
5
|
Author-email: Hadi Hamoud <hhamoud@dohainstitute.edu.qa>, Digital Research Unit - Arab Center <dru@dohainstitute.edu.qa>
|
|
6
6
|
Project-URL: Homepage, https://github.com/U4RASD/dalla-data-processing
|
|
@@ -11,10 +11,13 @@ Keywords: arabic,nlp,data-processing,deduplication,stemming,readability,quality
|
|
|
11
11
|
Classifier: Intended Audience :: Developers
|
|
12
12
|
Classifier: Intended Audience :: Science/Research
|
|
13
13
|
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
16
|
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
18
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
19
|
Classifier: Topic :: Text Processing :: Linguistic
|
|
17
|
-
Requires-Python:
|
|
20
|
+
Requires-Python: >=3.10
|
|
18
21
|
Description-Content-Type: text/markdown
|
|
19
22
|
Requires-Dist: datasets>=2.14.0
|
|
20
23
|
Requires-Dist: transformers>=4.30.0
|
|
@@ -14,6 +14,7 @@ dalla_data_processing.egg-info/PKG-INFO
|
|
|
14
14
|
dalla_data_processing.egg-info/SOURCES.txt
|
|
15
15
|
dalla_data_processing.egg-info/dependency_links.txt
|
|
16
16
|
dalla_data_processing.egg-info/entry_points.txt
|
|
17
|
+
dalla_data_processing.egg-info/not-zip-safe
|
|
17
18
|
dalla_data_processing.egg-info/requires.txt
|
|
18
19
|
dalla_data_processing.egg-info/top_level.txt
|
|
19
20
|
dalla_data_processing/core/README.md
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -11,13 +11,16 @@ authors = [
|
|
|
11
11
|
{name = "Digital Research Unit - Arab Center", email = "dru@dohainstitute.edu.qa"}
|
|
12
12
|
]
|
|
13
13
|
readme = "README.md"
|
|
14
|
-
requires-python = ">=3.
|
|
14
|
+
requires-python = ">=3.10"
|
|
15
15
|
keywords = ["arabic", "nlp", "data-processing", "deduplication", "stemming", "readability", "quality"]
|
|
16
16
|
classifiers = [
|
|
17
17
|
"Intended Audience :: Developers",
|
|
18
18
|
"Intended Audience :: Science/Research",
|
|
19
19
|
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
20
22
|
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Programming Language :: Python :: 3.13",
|
|
21
24
|
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
22
25
|
"Topic :: Text Processing :: Linguistic",
|
|
23
26
|
]
|
|
@@ -76,6 +79,8 @@ Repository = "https://github.com/U4RASD/dalla-data-processing"
|
|
|
76
79
|
packages = ["dalla_data_processing", "dalla_data_processing.core", "dalla_data_processing.deduplication", "dalla_data_processing.packing", "dalla_data_processing.stemming", "dalla_data_processing.quality", "dalla_data_processing.readability", "dalla_data_processing.utils"]
|
|
77
80
|
include-package-data = true
|
|
78
81
|
|
|
82
|
+
zip-safe = false
|
|
83
|
+
|
|
79
84
|
[tool.setuptools.package-data]
|
|
80
85
|
dalla_data_processing = ["py.typed"]
|
|
81
86
|
"dalla_data_processing.stemming" = ["data/*.txt"]
|
|
@@ -83,7 +88,7 @@ dalla_data_processing = ["py.typed"]
|
|
|
83
88
|
|
|
84
89
|
[tool.ruff]
|
|
85
90
|
line-length = 100
|
|
86
|
-
target-version = "
|
|
91
|
+
target-version = "py310"
|
|
87
92
|
src = ["dalla_data_processing"]
|
|
88
93
|
|
|
89
94
|
[tool.ruff.lint]
|
|
@@ -63,14 +63,16 @@ if [[ "$OSTYPE" == "darwin"* ]]; then
|
|
|
63
63
|
# On macOS, add Homebrew paths for sparsehash
|
|
64
64
|
if command -v brew &> /dev/null; then
|
|
65
65
|
BREW_PREFIX=$(brew --prefix)
|
|
66
|
-
|
|
66
|
+
SPARSEHASH_PREFIX=$(brew --prefix google-sparsehash 2>/dev/null || echo "${BREW_PREFIX}")
|
|
67
|
+
EXTRA_CFLAGS="-I${SPARSEHASH_PREFIX}/include"
|
|
67
68
|
echo -e "${YELLOW}Using Homebrew prefix: ${BREW_PREFIX}${NC}"
|
|
69
|
+
echo -e "${YELLOW}Using sparsehash include: ${SPARSEHASH_PREFIX}/include${NC}"
|
|
68
70
|
fi
|
|
69
71
|
fi
|
|
70
72
|
|
|
71
73
|
# Build onion
|
|
72
74
|
echo -e "${YELLOW}Compiling Onion...${NC}"
|
|
73
|
-
if make CFLAGS="-Wall -O3 ${EXTRA_CFLAGS}"; then
|
|
75
|
+
if make CFLAGS="-Wall -O3 -std=c++11 ${EXTRA_CFLAGS}"; then
|
|
74
76
|
echo -e "${GREEN}✓ Compilation successful${NC}"
|
|
75
77
|
else
|
|
76
78
|
echo -e "${RED}✗ Compilation failed${NC}"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/core/README.md
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/core/__init__.py
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/core/dataset.py
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/core/parallel.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/packing/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/quality/README.md
RENAMED
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/quality/checker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/utils/__init__.py
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/utils/logger.py
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.3 → dalla_data_processing-0.0.4}/dalla_data_processing/utils/tokenize.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|