dalla-data-processing 0.0.3__tar.gz → 0.0.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/.github/workflows/ci.yml +2 -2
  2. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/.github/workflows/release.yml +28 -35
  3. dalla_data_processing-0.0.10/LICENSE +353 -0
  4. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/PKG-INFO +8 -2
  5. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/_version.py +3 -3
  6. {dalla_data_processing-0.0.3/dalla_data_processing/deduplication/onion/src_sc → dalla_data_processing-0.0.10/dalla_data_processing/deduplication/onion/src}/Makefile +1 -1
  7. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src/Makefile.g +1 -1
  8. {dalla_data_processing-0.0.3/dalla_data_processing/deduplication/onion/src → dalla_data_processing-0.0.10/dalla_data_processing/deduplication/onion/src_sc}/Makefile +1 -1
  9. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src_sc/Makefile.g +1 -1
  10. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing.egg-info/PKG-INFO +8 -2
  11. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing.egg-info/SOURCES.txt +2 -0
  12. dalla_data_processing-0.0.10/dalla_data_processing.egg-info/not-zip-safe +1 -0
  13. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/pyproject.toml +9 -3
  14. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/scripts/build_onion.sh +22 -2
  15. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/.dockerignore +0 -0
  16. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/.gitignore +0 -0
  17. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/.pre-commit-config.yaml +0 -0
  18. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/MANIFEST.in +0 -0
  19. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/README.md +0 -0
  20. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/__init__.py +0 -0
  21. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/cli.py +0 -0
  22. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/core/README.md +0 -0
  23. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/core/__init__.py +0 -0
  24. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/core/dataset.py +0 -0
  25. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/core/parallel.py +0 -0
  26. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/README.md +0 -0
  27. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/__init__.py +0 -0
  28. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/bin/.gitignore +0 -0
  29. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/COPYING +0 -0
  30. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/Makefile +0 -0
  31. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/Makefile.config +0 -0
  32. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/README.md +0 -0
  33. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src/buzhash.c +0 -0
  34. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src/buzhash.h +0 -0
  35. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src/hashdup.c +0 -0
  36. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src/hashgen.c +0 -0
  37. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src/onion +0 -0
  38. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src/onion.c +0 -0
  39. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src/onion_dup.c +0 -0
  40. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src/version.c +0 -0
  41. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src/version.h +0 -0
  42. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src_sc/.gitignore +0 -0
  43. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src_sc/buzhash.c +0 -0
  44. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src_sc/buzhash.h +0 -0
  45. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src_sc/hashdup +0 -0
  46. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src_sc/hashdup.c +0 -0
  47. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src_sc/hashgen +0 -0
  48. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src_sc/hashgen.c +0 -0
  49. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src_sc/onion.c +0 -0
  50. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src_sc/onion_dup.c +0 -0
  51. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src_sc/version.c +0 -0
  52. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion/src_sc/version.h +0 -0
  53. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/onion_wrapper.py +0 -0
  54. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/postprocessing.py +0 -0
  55. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/deduplication/preprocessing.py +0 -0
  56. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/packing/README.md +0 -0
  57. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/packing/__init__.py +0 -0
  58. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/packing/dataset_packer.py +0 -0
  59. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/packing/pack_config.example.yaml +0 -0
  60. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/quality/README.md +0 -0
  61. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/quality/__init__.py +0 -0
  62. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/quality/checker.py +0 -0
  63. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/readability/README.md +0 -0
  64. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/readability/__init__.py +0 -0
  65. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/readability/ranking.py +0 -0
  66. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/readability/scorer.py +0 -0
  67. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/stemming/README.md +0 -0
  68. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/stemming/__init__.py +0 -0
  69. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/stemming/data/words_al.txt +0 -0
  70. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/stemming/data/words_al_t.txt +0 -0
  71. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/stemming/data/words_t.txt +0 -0
  72. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/stemming/stemmer.py +0 -0
  73. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/utils/__init__.py +0 -0
  74. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/utils/logger.py +0 -0
  75. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing/utils/tokenize.py +0 -0
  76. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing.egg-info/dependency_links.txt +0 -0
  77. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing.egg-info/entry_points.txt +0 -0
  78. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing.egg-info/requires.txt +0 -0
  79. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/dalla_data_processing.egg-info/top_level.txt +0 -0
  80. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/scripts/release.sh +0 -0
  81. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/setup.cfg +0 -0
  82. {dalla_data_processing-0.0.3 → dalla_data_processing-0.0.10}/uv.lock +0 -0
@@ -55,8 +55,8 @@ jobs:
55
55
  include:
56
56
  - os: ubuntu-latest
57
57
  artifact_name: onion-linux-x86_64
58
- - os: macos-latest
59
- artifact_name: onion-darwin-x86_64
58
+ - os: macos-14
59
+ artifact_name: onion-darwin-universal
60
60
 
61
61
  steps:
62
62
  - uses: actions/checkout@v4
@@ -17,8 +17,8 @@ jobs:
17
17
  include:
18
18
  - os: ubuntu-latest
19
19
  artifact_name: onion-linux-x86_64
20
- - os: macos-latest
21
- artifact_name: onion-darwin-x86_64
20
+ - os: macos-14
21
+ artifact_name: onion-darwin-universal
22
22
 
23
23
  steps:
24
24
  - uses: actions/checkout@v4
@@ -46,19 +46,10 @@ jobs:
46
46
  path: dalla_data_processing/deduplication/bin/onion-*
47
47
  if-no-files-found: error
48
48
 
49
- build-wheels:
50
- name: Build wheels on ${{ matrix.os }}
49
+ build-wheel:
50
+ name: Build universal wheel with all platform binaries
51
51
  needs: build-onion
52
- runs-on: ${{ matrix.os }}
53
- strategy:
54
- matrix:
55
- include:
56
- - os: ubuntu-latest
57
- onion_artifact: onion-linux-x86_64
58
- - os: macos-latest
59
- onion_artifact: onion-darwin-x86_64
60
- - os: windows-latest
61
- onion_artifact: none
52
+ runs-on: ubuntu-latest
62
53
 
63
54
  steps:
64
55
  - uses: actions/checkout@v4
@@ -67,24 +58,24 @@ jobs:
67
58
 
68
59
  - name: Create bin directory
69
60
  run: mkdir -p dalla_data_processing/deduplication/bin
70
- shell: bash
71
61
 
72
- - name: Download Onion binary
73
- if: matrix.onion_artifact != 'none'
62
+ - name: Download Linux binary
63
+ uses: actions/download-artifact@v4
64
+ with:
65
+ name: onion-linux-x86_64
66
+ path: dalla_data_processing/deduplication/bin/
67
+
68
+ - name: Download macOS universal binary
74
69
  uses: actions/download-artifact@v4
75
70
  with:
76
- name: ${{ matrix.onion_artifact }}
71
+ name: onion-darwin-universal
77
72
  path: dalla_data_processing/deduplication/bin/
78
73
 
79
74
  - name: Set binary permissions
80
- if: matrix.onion_artifact != 'none'
81
75
  run: chmod +x dalla_data_processing/deduplication/bin/onion-*
82
- shell: bash
83
76
 
84
- - name: List binary files
85
- if: matrix.onion_artifact != 'none'
77
+ - name: List all binaries
86
78
  run: ls -lah dalla_data_processing/deduplication/bin/
87
- shell: bash
88
79
 
89
80
  - name: Set up Python
90
81
  uses: actions/setup-python@v5
@@ -94,20 +85,23 @@ jobs:
94
85
  - name: Install build dependencies
95
86
  run: |
96
87
  python -m pip install --upgrade pip
97
- pip install build twine
88
+ pip install build "twine>=5.0,<6.1"
98
89
 
99
90
  - name: Build wheel
100
91
  run: python -m build
101
92
 
102
93
  - name: List wheel contents
103
94
  run: |
104
- python -m zipfile -l dist/*.whl | head -50
95
+ echo "=== Wheel contents (binaries) ==="
96
+ python -m zipfile -l dist/*.whl | grep onion-
97
+ echo "=== Wheel info ==="
98
+ ls -lh dist/
105
99
 
106
- - name: Upload wheels
100
+ - name: Upload wheel
107
101
  uses: actions/upload-artifact@v4
108
102
  with:
109
- name: wheels-${{ matrix.os }}
110
- path: dist/*
103
+ name: wheel
104
+ path: dist/*.whl
111
105
 
112
106
  build-sdist:
113
107
  name: Build source distribution
@@ -136,15 +130,15 @@ jobs:
136
130
 
137
131
  publish-pypi:
138
132
  name: Publish to PyPI
139
- needs: [build-wheels, build-sdist]
133
+ needs: [build-wheel, build-sdist]
140
134
  runs-on: ubuntu-latest
141
135
  environment: release
142
136
 
143
137
  steps:
144
- # Only use Linux wheel to avoid conflicts
138
+ # Download the universal wheel with all platform binaries
145
139
  - uses: actions/download-artifact@v4
146
140
  with:
147
- name: wheels-ubuntu-latest
141
+ name: wheel
148
142
  path: dist/
149
143
 
150
144
  - uses: actions/download-artifact@v4
@@ -157,7 +151,7 @@ jobs:
157
151
 
158
152
  - name: Verify wheel integrity
159
153
  run: |
160
- pip install twine
154
+ pip install "twine>=5.0,<6.1"
161
155
  twine check dist/*
162
156
 
163
157
  - name: Publish to PyPI
@@ -168,7 +162,7 @@ jobs:
168
162
 
169
163
  create-release:
170
164
  name: Create GitHub Release
171
- needs: [build-wheels, build-sdist]
165
+ needs: [build-wheel, build-sdist]
172
166
  runs-on: ubuntu-latest
173
167
 
174
168
  steps:
@@ -178,8 +172,7 @@ jobs:
178
172
 
179
173
  - uses: actions/download-artifact@v4
180
174
  with:
181
- pattern: wheels-*
182
- merge-multiple: true
175
+ name: wheel
183
176
  path: dist/
184
177
 
185
178
  - uses: actions/download-artifact@v4
@@ -0,0 +1,353 @@
1
+ Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
2
+
3
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
4
+ does not provide legal services or legal advice. Distribution of
5
+ Creative Commons public licenses does not create a lawyer-client or
6
+ other relationship. Creative Commons makes its licenses and related
7
+ information available on an "as-is" basis. Creative Commons gives no
8
+ warranties regarding its licenses, any material licensed under their
9
+ terms and conditions, or any related information. Creative Commons
10
+ disclaims all liability for damages resulting from their use to the
11
+ fullest extent possible.
12
+
13
+ Using Creative Commons Public Licenses
14
+
15
+ Creative Commons public licenses provide a standard set of terms and
16
+ conditions that creators and other rights holders may use to share
17
+ original works of authorship and other material subject to copyright and
18
+ certain other rights specified in the public license below. The
19
+ following considerations are for informational purposes only, are not
20
+ exhaustive, and do not form part of our licenses.
21
+
22
+ Considerations for licensors: Our public licenses are intended for use
23
+ by those authorized to give the public permission to use material in
24
+ ways otherwise restricted by copyright and certain other rights. Our
25
+ licenses are irrevocable. Licensors should read and understand the terms
26
+ and conditions of the license they choose before applying it. Licensors
27
+ should also secure all rights necessary before applying our licenses so
28
+ that the public can reuse the material as expected. Licensors should
29
+ clearly mark any material not subject to the license. This includes
30
+ other CC-licensed material, or material used under an exception or
31
+ limitation to copyright. More considerations for licensors :
32
+ wiki.creativecommons.org/Considerations\_for\_licensors
33
+
34
+ Considerations for the public: By using one of our public licenses, a
35
+ licensor grants the public permission to use the licensed material under
36
+ specified terms and conditions. If the licensor's permission is not
37
+ necessary for any reason–for example, because of any applicable
38
+ exception or limitation to copyright–then that use is not regulated by
39
+ the license. Our licenses grant only permissions under copyright and
40
+ certain other rights that a licensor has authority to grant. Use of the
41
+ licensed material may still be restricted for other reasons, including
42
+ because others have copyright or other rights in the material. A
43
+ licensor may make special requests, such as asking that all changes be
44
+ marked or described. Although not required by our licenses, you are
45
+ encouraged to respect those requests where reasonable. More
46
+ considerations for the public :
47
+ wiki.creativecommons.org/Considerations\_for\_licensees
48
+
49
+ Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
50
+ Public License
51
+
52
+ By exercising the Licensed Rights (defined below), You accept and agree
53
+ to be bound by the terms and conditions of this Creative Commons
54
+ Attribution-NonCommercial-ShareAlike 4.0 International Public License
55
+ ("Public License"). To the extent this Public License may be interpreted
56
+ as a contract, You are granted the Licensed Rights in consideration of
57
+ Your acceptance of these terms and conditions, and the Licensor grants
58
+ You such rights in consideration of benefits the Licensor receives from
59
+ making the Licensed Material available under these terms and conditions.
60
+
61
+ Section 1 – Definitions.
62
+
63
+ - a. Adapted Material means material subject to Copyright and Similar
64
+ Rights that is derived from or based upon the Licensed Material and
65
+ in which the Licensed Material is translated, altered, arranged,
66
+ transformed, or otherwise modified in a manner requiring permission
67
+ under the Copyright and Similar Rights held by the Licensor. For
68
+ purposes of this Public License, where the Licensed Material is a
69
+ musical work, performance, or sound recording, Adapted Material is
70
+ always produced where the Licensed Material is synched in timed
71
+ relation with a moving image.
72
+ - b. Adapter's License means the license You apply to Your Copyright
73
+ and Similar Rights in Your contributions to Adapted Material in
74
+ accordance with the terms and conditions of this Public License.
75
+ - c. BY-NC-SA Compatible License means a license listed at
76
+ creativecommons.org/compatiblelicenses, approved by Creative Commons
77
+ as essentially the equivalent of this Public License.
78
+ - d. Copyright and Similar Rights means copyright and/or similar
79
+ rights closely related to copyright including, without limitation,
80
+ performance, broadcast, sound recording, and Sui Generis Database
81
+ Rights, without regard to how the rights are labeled or categorized.
82
+ For purposes of this Public License, the rights specified in Section
83
+ 2(b)(1)-(2) are not Copyright and Similar Rights.
84
+ - e. Effective Technological Measures means those measures that, in
85
+ the absence of proper authority, may not be circumvented under laws
86
+ fulfilling obligations under Article 11 of the WIPO Copyright Treaty
87
+ adopted on December 20, 1996, and/or similar international
88
+ agreements.
89
+ - f. Exceptions and Limitations means fair use, fair dealing, and/or
90
+ any other exception or limitation to Copyright and Similar Rights
91
+ that applies to Your use of the Licensed Material.
92
+ - g. License Elements means the license attributes listed in the name
93
+ of a Creative Commons Public License. The License Elements of this
94
+ Public License are Attribution, NonCommercial, and ShareAlike.
95
+ - h. Licensed Material means the artistic or literary work, database,
96
+ or other material to which the Licensor applied this Public License.
97
+ - i. Licensed Rights means the rights granted to You subject to the
98
+ terms and conditions of this Public License, which are limited to
99
+ all Copyright and Similar Rights that apply to Your use of the
100
+ Licensed Material and that the Licensor has authority to license.
101
+ - j. Licensor means the individual(s) or entity(ies) granting rights
102
+ under this Public License.
103
+ - k. NonCommercial means not primarily intended for or directed
104
+ towards commercial advantage or monetary compensation. For purposes
105
+ of this Public License, the exchange of the Licensed Material for
106
+ other material subject to Copyright and Similar Rights by digital
107
+ file-sharing or similar means is NonCommercial provided there is no
108
+ payment of monetary compensation in connection with the exchange.
109
+ - l. Share means to provide material to the public by any means or
110
+ process that requires permission under the Licensed Rights, such as
111
+ reproduction, public display, public performance, distribution,
112
+ dissemination, communication, or importation, and to make material
113
+ available to the public including in ways that members of the public
114
+ may access the material from a place and at a time individually
115
+ chosen by them.
116
+ - m. Sui Generis Database Rights means rights other than copyright
117
+ resulting from Directive 96/9/EC of the European Parliament and of
118
+ the Council of 11 March 1996 on the legal protection of databases,
119
+ as amended and/or succeeded, as well as other essentially equivalent
120
+ rights anywhere in the world.
121
+ - n. You means the individual or entity exercising the Licensed Rights
122
+ under this Public License. Your has a corresponding meaning.
123
+
124
+ Section 2 – Scope.
125
+
126
+ - a. License grant.
127
+ - 1. Subject to the terms and conditions of this Public License,
128
+ the Licensor hereby grants You a worldwide, royalty-free,
129
+ non-sublicensable, non-exclusive, irrevocable license to
130
+ exercise the Licensed Rights in the Licensed Material to:
131
+ - A. reproduce and Share the Licensed Material, in whole or in
132
+ part, for NonCommercial purposes only; and
133
+ - B. produce, reproduce, and Share Adapted Material for
134
+ NonCommercial purposes only.
135
+ - 2. Exceptions and Limitations. For the avoidance of doubt, where
136
+ Exceptions and Limitations apply to Your use, this Public
137
+ License does not apply, and You do not need to comply with its
138
+ terms and conditions.
139
+ - 3. Term. The term of this Public License is specified in Section
140
+ 6(a).
141
+ - 4. Media and formats; technical modifications allowed. The
142
+ Licensor authorizes You to exercise the Licensed Rights in all
143
+ media and formats whether now known or hereafter created, and to
144
+ make technical modifications necessary to do so. The Licensor
145
+ waives and/or agrees not to assert any right or authority to
146
+ forbid You from making technical modifications necessary to
147
+ exercise the Licensed Rights, including technical modifications
148
+ necessary to circumvent Effective Technological Measures. For
149
+ purposes of this Public License, simply making modifications
150
+ authorized by this Section 2(a)(4) never produces Adapted
151
+ Material.
152
+ - 5. Downstream recipients.
153
+ - A. Offer from the Licensor – Licensed Material. Every
154
+ recipient of the Licensed Material automatically receives an
155
+ offer from the Licensor to exercise the Licensed Rights
156
+ under the terms and conditions of this Public License.
157
+ - B. Additional offer from the Licensor – Adapted Material.
158
+ Every recipient of Adapted Material from You automatically
159
+ receives an offer from the Licensor to exercise the Licensed
160
+ Rights in the Adapted Material under the conditions of the
161
+ Adapter's License You apply.
162
+ - C. No downstream restrictions. You may not offer or impose
163
+ any additional or different terms or conditions on, or apply
164
+ any Effective Technological Measures to, the Licensed
165
+ Material if doing so restricts exercise of the Licensed
166
+ Rights by any recipient of the Licensed Material.
167
+ - 6. No endorsement. Nothing in this Public License constitutes or
168
+ may be construed as permission to assert or imply that You are,
169
+ or that Your use of the Licensed Material is, connected with, or
170
+ sponsored, endorsed, or granted official status by, the Licensor
171
+ or others designated to receive attribution as provided in
172
+ Section 3(a)(1)(A)(i).
173
+ - b. Other rights.
174
+ - 1. Moral rights, such as the right of integrity, are not
175
+ licensed under this Public License, nor are publicity, privacy,
176
+ and/or other similar personality rights; however, to the extent
177
+ possible, the Licensor waives and/or agrees not to assert any
178
+ such rights held by the Licensor to the limited extent necessary
179
+ to allow You to exercise the Licensed Rights, but not otherwise.
180
+ - 2. Patent and trademark rights are not licensed under this
181
+ Public License.
182
+ - 3. To the extent possible, the Licensor waives any right to
183
+ collect royalties from You for the exercise of the Licensed
184
+ Rights, whether directly or through a collecting society under
185
+ any voluntary or waivable statutory or compulsory licensing
186
+ scheme. In all other cases the Licensor expressly reserves any
187
+ right to collect such royalties, including when the Licensed
188
+ Material is used other than for NonCommercial purposes.
189
+
190
+ Section 3 – License Conditions.
191
+
192
+ Your exercise of the Licensed Rights is expressly made subject to the
193
+ following conditions.
194
+
195
+ - a. Attribution.
196
+ - 1. If You Share the Licensed Material (including in modified
197
+ form), You must:
198
+ - A. retain the following if it is supplied by the Licensor
199
+ with the Licensed Material:
200
+ - i. identification of the creator(s) of the Licensed
201
+ Material and any others designated to receive
202
+ attribution, in any reasonable manner requested by the
203
+ Licensor (including by pseudonym if designated);
204
+ - ii. a copyright notice;
205
+ - iii. a notice that refers to this Public License;
206
+ - iv. a notice that refers to the disclaimer of
207
+ warranties;
208
+ - v. a URI or hyperlink to the Licensed Material to the
209
+ extent reasonably practicable;
210
+
211
+ - B. indicate if You modified the Licensed Material and retain
212
+ an indication of any previous modifications; and
213
+ - C. indicate the Licensed Material is licensed under this
214
+ Public License, and include the text of, or the URI or
215
+ hyperlink to, this Public License.
216
+ - 2. You may satisfy the conditions in Section 3(a)(1) in any
217
+ reasonable manner based on the medium, means, and context in
218
+ which You Share the Licensed Material. For example, it may be
219
+ reasonable to satisfy the conditions by providing a URI or
220
+ hyperlink to a resource that includes the required information.
221
+ - 3. If requested by the Licensor, You must remove any of the
222
+ information required by Section 3(a)(1)(A) to the extent
223
+ reasonably practicable.
224
+ - b. ShareAlike.In addition to the conditions in Section 3(a), if You
225
+ Share Adapted Material You produce, the following conditions also
226
+ apply.
227
+ - 1. The Adapter's License You apply must be a Creative Commons
228
+ license with the same License Elements, this version or later,
229
+ or a BY-NC-SA Compatible License.
230
+ - 2. You must include the text of, or the URI or hyperlink to, the
231
+ Adapter's License You apply. You may satisfy this condition in
232
+ any reasonable manner based on the medium, means, and context in
233
+ which You Share Adapted Material.
234
+ - 3. You may not offer or impose any additional or different terms
235
+ or conditions on, or apply any Effective Technological Measures
236
+ to, Adapted Material that restrict exercise of the rights
237
+ granted under the Adapter's License You apply.
238
+
239
+ Section 4 – Sui Generis Database Rights.
240
+
241
+ Where the Licensed Rights include Sui Generis Database Rights that apply
242
+ to Your use of the Licensed Material:
243
+
244
+ - a. for the avoidance of doubt, Section 2(a)(1) grants You the right
245
+ to extract, reuse, reproduce, and Share all or a substantial portion
246
+ of the contents of the database for NonCommercial purposes only;
247
+ - b. if You include all or a substantial portion of the database
248
+ contents in a database in which You have Sui Generis Database
249
+ Rights, then the database in which You have Sui Generis Database
250
+ Rights (but not its individual contents) is Adapted Material,
251
+ including for purposes of Section 3(b); and
252
+ - c. You must comply with the conditions in Section 3(a) if You Share
253
+ all or a substantial portion of the contents of the database.
254
+ For the avoidance of doubt, this Section 4 supplements and does not
255
+ replace Your obligations under this Public License where the
256
+ Licensed Rights include other Copyright and Similar Rights.
257
+
258
+ Section 5 – Disclaimer of Warranties and Limitation of Liability.
259
+
260
+ - a. Unless otherwise separately undertaken by the Licensor, to the
261
+ extent possible, the Licensor offers the Licensed Material as-is and
262
+ as-available, and makes no representations or warranties of any kind
263
+ concerning the Licensed Material, whether express, implied,
264
+ statutory, or other. This includes, without limitation, warranties
265
+ of title, merchantability, fitness for a particular purpose,
266
+ non-infringement, absence of latent or other defects, accuracy, or
267
+ the presence or absence of errors, whether or not known or
268
+ discoverable. Where disclaimers of warranties are not allowed in
269
+ full or in part, this disclaimer may not apply to You.
270
+ - b. To the extent possible, in no event will the Licensor be liable
271
+ to You on any legal theory (including, without limitation,
272
+ negligence) or otherwise for any direct, special, indirect,
273
+ incidental, consequential, punitive, exemplary, or other losses,
274
+ costs, expenses, or damages arising out of this Public License or
275
+ use of the Licensed Material, even if the Licensor has been advised
276
+ of the possibility of such losses, costs, expenses, or damages.
277
+ Where a limitation of liability is not allowed in full or in part,
278
+ this limitation may not apply to You.
279
+ - c. The disclaimer of warranties and limitation of liability provided
280
+ above shall be interpreted in a manner that, to the extent possible,
281
+ most closely approximates an absolute disclaimer and waiver of all
282
+ liability.
283
+
284
+ Section 6 – Term and Termination.
285
+
286
+ - a. This Public License applies for the term of the Copyright and
287
+ Similar Rights licensed here. However, if You fail to comply with
288
+ this Public License, then Your rights under this Public License
289
+ terminate automatically.
290
+ - b. Where Your right to use the Licensed Material has terminated
291
+ under Section 6(a), it reinstates:
292
+
293
+ - 1. automatically as of the date the violation is cured, provided
294
+ it is cured within 30 days of Your discovery of the violation;
295
+ or
296
+ - 2. upon express reinstatement by the Licensor.
297
+
298
+ For the avoidance of doubt, this Section 6(b) does not affect any
299
+ right the Licensor may have to seek remedies for Your violations of
300
+ this Public License.
301
+
302
+ - c. For the avoidance of doubt, the Licensor may also offer the
303
+ Licensed Material under separate terms or conditions or stop
304
+ distributing the Licensed Material at any time; however, doing so
305
+ will not terminate this Public License.
306
+ - d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
307
+ License.
308
+
309
+ Section 7 – Other Terms and Conditions.
310
+
311
+ - a. The Licensor shall not be bound by any additional or different
312
+ terms or conditions communicated by You unless expressly agreed.
313
+ - b. Any arrangements, understandings, or agreements regarding the
314
+ Licensed Material not stated herein are separate from and
315
+ independent of the terms and conditions of this Public License.
316
+
317
+ Section 8 – Interpretation.
318
+
319
+ - a. For the avoidance of doubt, this Public License does not, and
320
+ shall not be interpreted to, reduce, limit, restrict, or impose
321
+ conditions on any use of the Licensed Material that could lawfully
322
+ be made without permission under this Public License.
323
+ - b. To the extent possible, if any provision of this Public License
324
+ is deemed unenforceable, it shall be automatically reformed to the
325
+ minimum extent necessary to make it enforceable. If the provision
326
+ cannot be reformed, it shall be severed from this Public License
327
+ without affecting the enforceability of the remaining terms and
328
+ conditions.
329
+ - c. No term or condition of this Public License will be waived and no
330
+ failure to comply consented to unless expressly agreed to by the
331
+ Licensor.
332
+ - d. Nothing in this Public License constitutes or may be interpreted
333
+ as a limitation upon, or waiver of, any privileges and immunities
334
+ that apply to the Licensor or You, including from the legal
335
+ processes of any jurisdiction or authority.
336
+
337
+ Creative Commons is not a party to its public licenses. Notwithstanding,
338
+ Creative Commons may elect to apply one of its public licenses to
339
+ material it publishes and in those instances will be considered the
340
+ "Licensor." The text of the Creative Commons public licenses is
341
+ dedicated to the public domain under the CC0 Public Domain Dedication.
342
+ Except for the limited purpose of indicating that material is shared
343
+ under a Creative Commons public license or as otherwise permitted by the
344
+ Creative Commons policies published at creativecommons.org/policies,
345
+ Creative Commons does not authorize the use of the trademark "Creative
346
+ Commons" or any other trademark or logo of Creative Commons without its
347
+ prior written consent including, without limitation, in connection with
348
+ any unauthorized modifications to any of its public licenses or any
349
+ other arrangements, understandings, or agreements concerning use of
350
+ licensed material. For the avoidance of doubt, this paragraph does not
351
+ form part of the public licenses.
352
+
353
+ Creative Commons may be contacted at creativecommons.org.
@@ -1,8 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dalla-data-processing
3
- Version: 0.0.3
3
+ Version: 0.0.10
4
4
  Summary: data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models
5
5
  Author-email: Hadi Hamoud <hhamoud@dohainstitute.edu.qa>, Digital Research Unit - Arab Center <dru@dohainstitute.edu.qa>
6
+ License: CC-BY-NC-SA-4.0
6
7
  Project-URL: Homepage, https://github.com/U4RASD/dalla-data-processing
7
8
  Project-URL: Documentation, https://github.com/U4RASD/dalla-data-processing#readme
8
9
  Project-URL: Repository, https://github.com/U4RASD/dalla-data-processing
@@ -11,11 +12,15 @@ Keywords: arabic,nlp,data-processing,deduplication,stemming,readability,quality
11
12
  Classifier: Intended Audience :: Developers
12
13
  Classifier: Intended Audience :: Science/Research
13
14
  Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
14
17
  Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
15
19
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
20
  Classifier: Topic :: Text Processing :: Linguistic
17
- Requires-Python: <3.13,>=3.12
21
+ Requires-Python: >=3.10
18
22
  Description-Content-Type: text/markdown
23
+ License-File: LICENSE
19
24
  Requires-Dist: datasets>=2.14.0
20
25
  Requires-Dist: transformers>=4.30.0
21
26
  Requires-Dist: click>=8.0.0
@@ -42,6 +47,7 @@ Requires-Dist: sentencepiece>=0.2.0; extra == "pack"
42
47
  Requires-Dist: pyyaml; extra == "pack"
43
48
  Provides-Extra: all
44
49
  Requires-Dist: dalla-data-processing[dedup,dedup-native,dev,pack,quality,readability,stem]; extra == "all"
50
+ Dynamic: license-file
45
51
 
46
52
  # Dalla Data Processing (dalla-dp)
47
53
 
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.0.3'
32
- __version_tuple__ = version_tuple = (0, 0, 3)
31
+ __version__ = version = '0.0.10'
32
+ __version_tuple__ = version_tuple = (0, 0, 10)
33
33
 
34
- __commit_id__ = commit_id = 'g37580acc9'
34
+ __commit_id__ = commit_id = 'gcc87ead80'
@@ -1,7 +1,7 @@
1
1
  include ../Makefile.config
2
2
 
3
3
  CC=g++
4
- CFLAGS=-Wall -O3
4
+ CFLAGS=-Wall -O3 -std=c++11 -I/opt/homebrew/opt/google-sparsehash/include
5
5
 
6
6
  OBJS=version.o buzhash.o
7
7
  TARGETS=hashgen hashdup onion
@@ -2,7 +2,7 @@ include ../Makefile.config
2
2
 
3
3
  CC=g++
4
4
  #CFLAGS=-Wall -O3
5
- CFLAGS=-Wall -g
5
+ CFLAGS=-Wall -g -std=c++11 -I/opt/homebrew/opt/google-sparsehash/include
6
6
 
7
7
  OBJS=version.o buzhash.o
8
8
  TARGETS=hashgen hashdup onion
@@ -1,7 +1,7 @@
1
1
  include ../Makefile.config
2
2
 
3
3
  CC=g++
4
- CFLAGS=-Wall -O3
4
+ CFLAGS=-Wall -O3 -std=c++11 -I/opt/homebrew/opt/google-sparsehash/include
5
5
 
6
6
  OBJS=version.o buzhash.o
7
7
  TARGETS=hashgen hashdup onion
@@ -2,7 +2,7 @@ include ../Makefile.config
2
2
 
3
3
  CC=g++
4
4
  #CFLAGS=-Wall -O3
5
- CFLAGS=-Wall -g
5
+ CFLAGS=-Wall -g -std=c++11 -I/opt/homebrew/opt/google-sparsehash/include
6
6
 
7
7
  OBJS=version.o buzhash.o
8
8
  TARGETS=hashgen hashdup onion
@@ -1,8 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dalla-data-processing
3
- Version: 0.0.3
3
+ Version: 0.0.10
4
4
  Summary: data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models
5
5
  Author-email: Hadi Hamoud <hhamoud@dohainstitute.edu.qa>, Digital Research Unit - Arab Center <dru@dohainstitute.edu.qa>
6
+ License: CC-BY-NC-SA-4.0
6
7
  Project-URL: Homepage, https://github.com/U4RASD/dalla-data-processing
7
8
  Project-URL: Documentation, https://github.com/U4RASD/dalla-data-processing#readme
8
9
  Project-URL: Repository, https://github.com/U4RASD/dalla-data-processing
@@ -11,11 +12,15 @@ Keywords: arabic,nlp,data-processing,deduplication,stemming,readability,quality
11
12
  Classifier: Intended Audience :: Developers
12
13
  Classifier: Intended Audience :: Science/Research
13
14
  Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
14
17
  Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
15
19
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
20
  Classifier: Topic :: Text Processing :: Linguistic
17
- Requires-Python: <3.13,>=3.12
21
+ Requires-Python: >=3.10
18
22
  Description-Content-Type: text/markdown
23
+ License-File: LICENSE
19
24
  Requires-Dist: datasets>=2.14.0
20
25
  Requires-Dist: transformers>=4.30.0
21
26
  Requires-Dist: click>=8.0.0
@@ -42,6 +47,7 @@ Requires-Dist: sentencepiece>=0.2.0; extra == "pack"
42
47
  Requires-Dist: pyyaml; extra == "pack"
43
48
  Provides-Extra: all
44
49
  Requires-Dist: dalla-data-processing[dedup,dedup-native,dev,pack,quality,readability,stem]; extra == "all"
50
+ Dynamic: license-file
45
51
 
46
52
  # Dalla Data Processing (dalla-dp)
47
53
 
@@ -1,6 +1,7 @@
1
1
  .dockerignore
2
2
  .gitignore
3
3
  .pre-commit-config.yaml
4
+ LICENSE
4
5
  MANIFEST.in
5
6
  README.md
6
7
  pyproject.toml
@@ -14,6 +15,7 @@ dalla_data_processing.egg-info/PKG-INFO
14
15
  dalla_data_processing.egg-info/SOURCES.txt
15
16
  dalla_data_processing.egg-info/dependency_links.txt
16
17
  dalla_data_processing.egg-info/entry_points.txt
18
+ dalla_data_processing.egg-info/not-zip-safe
17
19
  dalla_data_processing.egg-info/requires.txt
18
20
  dalla_data_processing.egg-info/top_level.txt
19
21
  dalla_data_processing/core/README.md
@@ -1,5 +1,5 @@
1
1
  [build-system]
2
- requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]
2
+ requires = ["setuptools>=70.0", "setuptools-scm>=8.0", "wheel"]
3
3
  build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
@@ -11,13 +11,17 @@ authors = [
11
11
  {name = "Digital Research Unit - Arab Center", email = "dru@dohainstitute.edu.qa"}
12
12
  ]
13
13
  readme = "README.md"
14
- requires-python = ">=3.12,<3.13"
14
+ license = {text = "CC-BY-NC-SA-4.0"}
15
+ requires-python = ">=3.10"
15
16
  keywords = ["arabic", "nlp", "data-processing", "deduplication", "stemming", "readability", "quality"]
16
17
  classifiers = [
17
18
  "Intended Audience :: Developers",
18
19
  "Intended Audience :: Science/Research",
19
20
  "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
20
23
  "Programming Language :: Python :: 3.12",
24
+ "Programming Language :: Python :: 3.13",
21
25
  "Topic :: Scientific/Engineering :: Artificial Intelligence",
22
26
  "Topic :: Text Processing :: Linguistic",
23
27
  ]
@@ -76,6 +80,8 @@ Repository = "https://github.com/U4RASD/dalla-data-processing"
76
80
  packages = ["dalla_data_processing", "dalla_data_processing.core", "dalla_data_processing.deduplication", "dalla_data_processing.packing", "dalla_data_processing.stemming", "dalla_data_processing.quality", "dalla_data_processing.readability", "dalla_data_processing.utils"]
77
81
  include-package-data = true
78
82
 
83
+ zip-safe = false
84
+
79
85
  [tool.setuptools.package-data]
80
86
  dalla_data_processing = ["py.typed"]
81
87
  "dalla_data_processing.stemming" = ["data/*.txt"]
@@ -83,7 +89,7 @@ dalla_data_processing = ["py.typed"]
83
89
 
84
90
  [tool.ruff]
85
91
  line-length = 100
86
- target-version = "py312"
92
+ target-version = "py310"
87
93
  src = ["dalla_data_processing"]
88
94
 
89
95
  [tool.ruff.lint]
@@ -59,18 +59,26 @@ make clean 2>/dev/null || true
59
59
 
60
60
  # Set up compiler flags for macOS
61
61
  EXTRA_CFLAGS=""
62
+ EXTRA_LDFLAGS=""
62
63
  if [[ "$OSTYPE" == "darwin"* ]]; then
63
64
  # On macOS, add Homebrew paths for sparsehash
64
65
  if command -v brew &> /dev/null; then
65
66
  BREW_PREFIX=$(brew --prefix)
66
- EXTRA_CFLAGS="-I${BREW_PREFIX}/include"
67
+ SPARSEHASH_PREFIX=$(brew --prefix google-sparsehash 2>/dev/null || echo "${BREW_PREFIX}")
68
+ EXTRA_CFLAGS="-I${SPARSEHASH_PREFIX}/include"
67
69
  echo -e "${YELLOW}Using Homebrew prefix: ${BREW_PREFIX}${NC}"
70
+ echo -e "${YELLOW}Using sparsehash include: ${SPARSEHASH_PREFIX}/include${NC}"
68
71
  fi
72
+
73
+ # Build universal binary for macOS (x86_64 + arm64)
74
+ EXTRA_CFLAGS="${EXTRA_CFLAGS} -arch x86_64 -arch arm64"
75
+ EXTRA_LDFLAGS="-arch x86_64 -arch arm64"
76
+ echo -e "${YELLOW}Building universal binary (x86_64 + arm64)${NC}"
69
77
  fi
70
78
 
71
79
  # Build onion
72
80
  echo -e "${YELLOW}Compiling Onion...${NC}"
73
- if make CFLAGS="-Wall -O3 ${EXTRA_CFLAGS}"; then
81
+ if make CFLAGS="-Wall -O3 -std=c++11 ${EXTRA_CFLAGS}" LDFLAGS="${EXTRA_LDFLAGS}"; then
74
82
  echo -e "${GREEN}✓ Compilation successful${NC}"
75
83
  else
76
84
  echo -e "${RED}✗ Compilation failed${NC}"
@@ -86,6 +94,12 @@ fi
86
94
  # Get platform info
87
95
  PLATFORM=$(uname -s | tr '[:upper:]' '[:lower:]')
88
96
  ARCH=$(uname -m)
97
+
98
+ # On macOS, use "universal" as arch since we build a fat binary
99
+ if [[ "$OSTYPE" == "darwin"* ]]; then
100
+ ARCH="universal"
101
+ fi
102
+
89
103
  echo -e "${YELLOW}Platform: $PLATFORM-$ARCH${NC}"
90
104
 
91
105
  # Create output directory
@@ -102,6 +116,12 @@ ln -sf "onion-$PLATFORM-$ARCH" "$OUTPUT_DIR/onion"
102
116
  echo -e "${GREEN}=== Build Complete ===${NC}"
103
117
  echo -e "${GREEN}Binary location: $OUTPUT_DIR/onion-$PLATFORM-$ARCH${NC}"
104
118
 
119
+ # On macOS, verify it's a universal binary
120
+ if [[ "$OSTYPE" == "darwin"* ]]; then
121
+ echo -e "${YELLOW}Verifying universal binary architectures:${NC}"
122
+ lipo -info "$OUTPUT_DIR/onion-$PLATFORM-$ARCH"
123
+ fi
124
+
105
125
  # Verify binary works
106
126
  if "$OUTPUT_DIR/onion-$PLATFORM-$ARCH" -h &> /dev/null; then
107
127
  echo -e "${GREEN}✓ Binary is executable and working${NC}"