datasketch 1.7.0__tar.gz → 1.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. datasketch-1.8.0/.gitignore +87 -0
  2. {datasketch-1.7.0/datasketch.egg-info → datasketch-1.8.0}/PKG-INFO +156 -53
  3. datasketch-1.7.0/PKG-INFO → datasketch-1.8.0/README.rst +118 -70
  4. {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/__init__.py +34 -9
  5. {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/b_bit_minhash.py +57 -72
  6. {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/experimental/__init__.py +4 -7
  7. {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/experimental/aio/lsh.py +101 -117
  8. {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/experimental/aio/storage.py +105 -107
  9. {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/hashfunc.py +6 -3
  10. {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/hnsw.py +103 -151
  11. {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/hyperloglog.py +55 -66
  12. {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/lean_minhash.py +25 -25
  13. {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/lsh.py +189 -132
  14. datasketch-1.8.0/datasketch/lsh_bloom.py +377 -0
  15. {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/lshensemble.py +37 -45
  16. {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/lshensemble_partition.py +34 -33
  17. {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/lshforest.py +28 -36
  18. {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/minhash.py +171 -47
  19. {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/storage.py +175 -228
  20. {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/weighted_minhash.py +23 -25
  21. datasketch-1.8.0/pyproject.toml +162 -0
  22. datasketch-1.7.0/README.rst +0 -88
  23. datasketch-1.7.0/datasketch/lsh_bloom.py +0 -335
  24. datasketch-1.7.0/datasketch/version.py +0 -1
  25. datasketch-1.7.0/datasketch.egg-info/SOURCES.txt +0 -38
  26. datasketch-1.7.0/datasketch.egg-info/dependency_links.txt +0 -1
  27. datasketch-1.7.0/datasketch.egg-info/requires.txt +0 -43
  28. datasketch-1.7.0/datasketch.egg-info/top_level.txt +0 -1
  29. datasketch-1.7.0/setup.cfg +0 -4
  30. datasketch-1.7.0/setup.py +0 -91
  31. datasketch-1.7.0/test/test_hnsw.py +0 -306
  32. datasketch-1.7.0/test/test_hyperloglog.py +0 -177
  33. datasketch-1.7.0/test/test_lean_minhash.py +0 -190
  34. datasketch-1.7.0/test/test_lsh.py +0 -451
  35. datasketch-1.7.0/test/test_lsh_cassandra.py +0 -269
  36. datasketch-1.7.0/test/test_lshbloom.py +0 -126
  37. datasketch-1.7.0/test/test_lshensemble.py +0 -92
  38. datasketch-1.7.0/test/test_lshforest.py +0 -149
  39. datasketch-1.7.0/test/test_minhash.py +0 -203
  40. datasketch-1.7.0/test/test_weighted_minhash.py +0 -108
  41. {datasketch-1.7.0 → datasketch-1.8.0}/LICENSE +0 -0
  42. {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/experimental/aio/__init__.py +0 -0
  43. {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/hyperloglog_const.py +0 -0
@@ -0,0 +1,87 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+
5
+ # C extensions
6
+ *.so
7
+
8
+ # VIM stuff
9
+ *.swp
10
+
11
+ # Distribution / packaging
12
+ .Python
13
+ env/
14
+ build/
15
+ develop-eggs/
16
+ dist/
17
+ downloads/
18
+ eggs/
19
+ .eggs/
20
+ lib/
21
+ lib64/
22
+ parts/
23
+ sdist/
24
+ var/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .coverage
43
+ .coverage.*
44
+ .cache
45
+ nosetests.xml
46
+ coverage.xml
47
+ *,cover
48
+
49
+ # Translations
50
+ *.mo
51
+ *.pot
52
+
53
+ # Django stuff:
54
+ *.log
55
+
56
+ # Sphinx documentation
57
+ docs/_build/
58
+
59
+ # PyBuilder
60
+ target/
61
+
62
+ # Documentation
63
+ .doctrees
64
+ _build
65
+ doctrees
66
+
67
+ # Jetbrains
68
+ .idea
69
+
70
+ # Benchmark files
71
+ benchmark/**/*.inp.gz
72
+
73
+ # Benchmark output
74
+ benchmark/**/*.sqlite
75
+
76
+ # Benchmark plots
77
+ benchmark/**/*.png
78
+ benchmark/**/*.pdf
79
+
80
+ # Virtual env
81
+ .venv
82
+
83
+ # IDE
84
+ .vscode
85
+
86
+ # MacOS
87
+ .DS_Store
@@ -1,67 +1,57 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datasketch
3
- Version: 1.7.0
3
+ Version: 1.8.0
4
4
  Summary: Probabilistic data structures for processing and searching very large datasets
5
- Home-page: https://ekzhu.github.io/datasketch
6
- Author: ekzhu
7
- Author-email: ekzhu@cs.toronto.edu
8
- License: MIT
5
+ Project-URL: Homepage, https://ekzhu.github.io/datasketch
6
+ Project-URL: Bug Tracker, https://github.com/ekzhu/datasketch/issues
7
+ Project-URL: Documentation, https://ekzhu.github.io/datasketch
9
8
  Project-URL: Source, https://github.com/ekzhu/datasketch
10
- Keywords: database datamining
9
+ Author-email: ekzhu <ekzhu@cs.toronto.edu>
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Keywords: database,datamining
11
13
  Classifier: Development Status :: 5 - Production/Stable
12
14
  Classifier: Intended Audience :: Developers
13
- Classifier: Topic :: Database
14
- Classifier: Topic :: Scientific/Engineering :: Information Analysis
15
15
  Classifier: License :: OSI Approved :: MIT License
16
- Classifier: Programming Language :: Python :: 3.8
17
16
  Classifier: Programming Language :: Python :: 3.9
18
17
  Classifier: Programming Language :: Python :: 3.10
19
18
  Classifier: Programming Language :: Python :: 3.11
20
19
  Classifier: Programming Language :: Python :: 3.12
21
- License-File: LICENSE
20
+ Classifier: Topic :: Database
21
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
22
+ Requires-Python: >=3.9
22
23
  Requires-Dist: numpy>=1.11
23
24
  Requires-Dist: scipy>=1.0.0
25
+ Provides-Extra: benchmark
26
+ Requires-Dist: matplotlib>=3.1.2; extra == 'benchmark'
27
+ Requires-Dist: nltk>=3.4.5; extra == 'benchmark'
28
+ Requires-Dist: pandas>=0.25.3; extra == 'benchmark'
29
+ Requires-Dist: pyfarmhash>=0.2.2; extra == 'benchmark'
30
+ Requires-Dist: pyhash>=0.9.3; extra == 'benchmark'
31
+ Requires-Dist: scikit-learn>=0.21.3; extra == 'benchmark'
32
+ Requires-Dist: scipy>=1.3.3; extra == 'benchmark'
33
+ Requires-Dist: setsimilaritysearch>=0.1.7; extra == 'benchmark'
34
+ Provides-Extra: bloom
35
+ Requires-Dist: pybloomfilter3>=0.7.2; extra == 'bloom'
24
36
  Provides-Extra: cassandra
25
- Requires-Dist: cassandra-driver>=3.20; extra == "cassandra"
37
+ Requires-Dist: cassandra-driver>=3.20; extra == 'cassandra'
38
+ Provides-Extra: experimental-aio
39
+ Requires-Dist: aiounittest; extra == 'experimental-aio'
40
+ Requires-Dist: motor>3.6.0; extra == 'experimental-aio'
26
41
  Provides-Extra: redis
27
- Requires-Dist: redis>=2.10.0; extra == "redis"
28
- Provides-Extra: bloom
29
- Requires-Dist: pybloomfilter3>=0.7.2; python_version >= "3.9" and extra == "bloom"
30
- Requires-Dist: pybloomfiltermmap3==0.6.0; python_version < "3.9" and extra == "bloom"
31
- Provides-Extra: benchmark
32
- Requires-Dist: pyhash>=0.9.3; extra == "benchmark"
33
- Requires-Dist: matplotlib>=3.1.2; extra == "benchmark"
34
- Requires-Dist: scikit-learn>=0.21.3; extra == "benchmark"
35
- Requires-Dist: scipy>=1.3.3; extra == "benchmark"
36
- Requires-Dist: pandas>=0.25.3; extra == "benchmark"
37
- Requires-Dist: SetSimilaritySearch>=0.1.7; extra == "benchmark"
38
- Requires-Dist: pyfarmhash>=0.2.2; extra == "benchmark"
39
- Requires-Dist: nltk>=3.4.5; extra == "benchmark"
42
+ Requires-Dist: redis>=2.10.0; extra == 'redis'
40
43
  Provides-Extra: test
41
- Requires-Dist: cassandra-driver>=3.20; extra == "test"
42
- Requires-Dist: redis>=2.10.0; extra == "test"
43
- Requires-Dist: mock>=2.0.0; extra == "test"
44
- Requires-Dist: mockredispy; extra == "test"
45
- Requires-Dist: coverage; extra == "test"
46
- Requires-Dist: pymongo>=3.9.0; extra == "test"
47
- Requires-Dist: nose>=1.3.7; extra == "test"
48
- Requires-Dist: nose-exclude>=0.5.0; extra == "test"
49
- Requires-Dist: pytest; extra == "test"
50
- Provides-Extra: experimental-aio
51
- Requires-Dist: aiounittest; python_version >= "3.8" and extra == "experimental-aio"
52
- Requires-Dist: motor>3.6.0; python_version >= "3.8" and extra == "experimental-aio"
53
- Dynamic: author
54
- Dynamic: author-email
55
- Dynamic: classifier
56
- Dynamic: description
57
- Dynamic: home-page
58
- Dynamic: keywords
59
- Dynamic: license
60
- Dynamic: license-file
61
- Dynamic: project-url
62
- Dynamic: provides-extra
63
- Dynamic: requires-dist
64
- Dynamic: summary
44
+ Requires-Dist: cassandra-driver>=3.20; extra == 'test'
45
+ Requires-Dist: coverage; extra == 'test'
46
+ Requires-Dist: mock>=2.0.0; extra == 'test'
47
+ Requires-Dist: mockredispy; extra == 'test'
48
+ Requires-Dist: nose-exclude>=0.5.0; extra == 'test'
49
+ Requires-Dist: nose>=1.3.7; extra == 'test'
50
+ Requires-Dist: pymongo>=3.9.0; extra == 'test'
51
+ Requires-Dist: pytest; extra == 'test'
52
+ Requires-Dist: pytest-rerunfailures; extra == 'test'
53
+ Requires-Dist: redis>=2.10.0; extra == 'test'
54
+ Description-Content-Type: text/x-rst
65
55
 
66
56
  datasketch: Big Data Looks Small
67
57
  ================================
@@ -107,7 +97,7 @@ sub-linear query time:
107
97
  | `HNSW`_ | Any | Custom Metric Top-K |
108
98
  +---------------------------+-----------------------------+------------------------+
109
99
 
110
- datasketch must be used with Python 3.8 or above, NumPy 1.11 or above, and Scipy.
100
+ datasketch must be used with Python 3.9 or above, NumPy 1.11 or above, and SciPy.
111
101
 
112
102
  Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
113
103
  storage layer (see `MinHash LSH at Scale`_).
@@ -117,7 +107,7 @@ Install
117
107
 
118
108
  To install datasketch using ``pip``:
119
109
 
120
- ::
110
+ .. code-block:: bash
121
111
 
122
112
  pip install datasketch
123
113
 
@@ -125,19 +115,19 @@ This will also install NumPy as dependency.
125
115
 
126
116
  To install with Redis dependency:
127
117
 
128
- ::
118
+ .. code-block:: bash
129
119
 
130
120
  pip install datasketch[redis]
131
121
 
132
122
  To install with Cassandra dependency:
133
123
 
134
- ::
124
+ .. code-block:: bash
135
125
 
136
126
  pip install datasketch[cassandra]
137
127
 
138
128
  To install with Bloom filter dependency:
139
129
 
140
- ::
130
+ .. code-block:: bash
141
131
 
142
132
  pip install datasketch[bloom]
143
133
 
@@ -151,3 +141,116 @@ To install with Bloom filter dependency:
151
141
  .. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
152
142
  .. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
153
143
  .. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
144
+
145
+ Contributing
146
+ ------------
147
+
148
+ We welcome contributions from everyone. Whether you're fixing bugs, adding features, improving documentation, or helping with tests, your contributions are valuable.
149
+
150
+ Development Setup
151
+ ^^^^^^^^^^^^^^^^^
152
+
153
+ The project uses ``uv`` for fast and reliable Python package management. Follow these steps to set up your development environment:
154
+
155
+ 1. **Install uv**: Follow the official installation guide at https://docs.astral.sh/uv/getting-started/installation/
156
+
157
+ 2. **Clone the repository**:
158
+
159
+ .. code-block:: bash
160
+
161
+ git clone https://github.com/ekzhu/datasketch.git
162
+ cd datasketch
163
+
164
+ 3. **Set up the environment**:
165
+
166
+ .. code-block:: bash
167
+
168
+ # Create a virtual environment
169
+ # (Optional: specify Python version with --python 3.x)
170
+ uv venv
171
+ # Activate the virtual environment (optional, uv run commands work without it)
172
+ source .venv/bin/activate
173
+
174
+ # Install all dependencies
175
+ uv sync
176
+
177
+ 4. **Verify installation**:
178
+
179
+ .. code-block:: bash
180
+
181
+ # Run tests to ensure everything works
182
+ uv run pytest
183
+
184
+ 5. **Optional dependencies** (for specific development needs):
185
+
186
+ .. code-block:: bash
187
+
188
+ # For testing
189
+ uv sync --extra test
190
+
191
+ # For Cassandra support
192
+ uv sync --extra cassandra
193
+
194
+ # For Redis support
195
+ uv sync --extra redis
196
+
197
+ # For all extras
198
+ uv sync --all-extras
199
+
200
+ Learn more about ``uv`` at https://docs.astral.sh/uv/
201
+
202
+ Development Workflow
203
+ ^^^^^^^^^^^^^^^^^^^^
204
+
205
+ 1. **Fork the repository** on GitHub if you haven't already.
206
+
207
+ 2. **Create a feature branch** for your changes:
208
+
209
+ .. code-block:: bash
210
+
211
+ git checkout -b feature/your-feature-name
212
+ # Or for bug fixes:
213
+ git checkout -b fix/issue-description
214
+
215
+ 3. **Make your changes** following the project's coding standards.
216
+
217
+ 4. **Run the tests** to ensure nothing is broken:
218
+
219
+ .. code-block:: bash
220
+
221
+ uv run pytest
222
+
223
+ 5. **Check code quality** with ruff:
224
+
225
+ .. code-block:: bash
226
+
227
+ # Check for issues
228
+ uvx ruff check .
229
+
230
+ # Auto-fix formatting issues
231
+ uvx ruff format .
232
+
233
+ 6. **Commit your changes** with a clear, descriptive commit message:
234
+
235
+ .. code-block:: bash
236
+
237
+ git commit -m "Add feature: brief description of what was changed"
238
+
239
+ 7. **Push to your fork** and create a pull request on GitHub:
240
+
241
+ .. code-block:: bash
242
+
243
+ git push origin your-branch-name
244
+
245
+ 8. **Respond to feedback** from maintainers and iterate on your changes.
246
+
247
+ Guidelines
248
+ ^^^^^^^^^^
249
+
250
+ - Follow PEP 8 style guidelines
251
+ - Write tests for new features
252
+ - Update documentation as needed
253
+ - Keep commits focused and atomic
254
+ - Be respectful in discussions
255
+
256
+ For more information, check the `GitHub issues <https://github.com/ekzhu/datasketch/issues>`_ for current priorities or areas needing help. You can also join the discussion on `project roadmap and priorities <https://github.com/ekzhu/datasketch/discussions/252>`_.
@@ -1,68 +1,3 @@
1
- Metadata-Version: 2.4
2
- Name: datasketch
3
- Version: 1.7.0
4
- Summary: Probabilistic data structures for processing and searching very large datasets
5
- Home-page: https://ekzhu.github.io/datasketch
6
- Author: ekzhu
7
- Author-email: ekzhu@cs.toronto.edu
8
- License: MIT
9
- Project-URL: Source, https://github.com/ekzhu/datasketch
10
- Keywords: database datamining
11
- Classifier: Development Status :: 5 - Production/Stable
12
- Classifier: Intended Audience :: Developers
13
- Classifier: Topic :: Database
14
- Classifier: Topic :: Scientific/Engineering :: Information Analysis
15
- Classifier: License :: OSI Approved :: MIT License
16
- Classifier: Programming Language :: Python :: 3.8
17
- Classifier: Programming Language :: Python :: 3.9
18
- Classifier: Programming Language :: Python :: 3.10
19
- Classifier: Programming Language :: Python :: 3.11
20
- Classifier: Programming Language :: Python :: 3.12
21
- License-File: LICENSE
22
- Requires-Dist: numpy>=1.11
23
- Requires-Dist: scipy>=1.0.0
24
- Provides-Extra: cassandra
25
- Requires-Dist: cassandra-driver>=3.20; extra == "cassandra"
26
- Provides-Extra: redis
27
- Requires-Dist: redis>=2.10.0; extra == "redis"
28
- Provides-Extra: bloom
29
- Requires-Dist: pybloomfilter3>=0.7.2; python_version >= "3.9" and extra == "bloom"
30
- Requires-Dist: pybloomfiltermmap3==0.6.0; python_version < "3.9" and extra == "bloom"
31
- Provides-Extra: benchmark
32
- Requires-Dist: pyhash>=0.9.3; extra == "benchmark"
33
- Requires-Dist: matplotlib>=3.1.2; extra == "benchmark"
34
- Requires-Dist: scikit-learn>=0.21.3; extra == "benchmark"
35
- Requires-Dist: scipy>=1.3.3; extra == "benchmark"
36
- Requires-Dist: pandas>=0.25.3; extra == "benchmark"
37
- Requires-Dist: SetSimilaritySearch>=0.1.7; extra == "benchmark"
38
- Requires-Dist: pyfarmhash>=0.2.2; extra == "benchmark"
39
- Requires-Dist: nltk>=3.4.5; extra == "benchmark"
40
- Provides-Extra: test
41
- Requires-Dist: cassandra-driver>=3.20; extra == "test"
42
- Requires-Dist: redis>=2.10.0; extra == "test"
43
- Requires-Dist: mock>=2.0.0; extra == "test"
44
- Requires-Dist: mockredispy; extra == "test"
45
- Requires-Dist: coverage; extra == "test"
46
- Requires-Dist: pymongo>=3.9.0; extra == "test"
47
- Requires-Dist: nose>=1.3.7; extra == "test"
48
- Requires-Dist: nose-exclude>=0.5.0; extra == "test"
49
- Requires-Dist: pytest; extra == "test"
50
- Provides-Extra: experimental-aio
51
- Requires-Dist: aiounittest; python_version >= "3.8" and extra == "experimental-aio"
52
- Requires-Dist: motor>3.6.0; python_version >= "3.8" and extra == "experimental-aio"
53
- Dynamic: author
54
- Dynamic: author-email
55
- Dynamic: classifier
56
- Dynamic: description
57
- Dynamic: home-page
58
- Dynamic: keywords
59
- Dynamic: license
60
- Dynamic: license-file
61
- Dynamic: project-url
62
- Dynamic: provides-extra
63
- Dynamic: requires-dist
64
- Dynamic: summary
65
-
66
1
  datasketch: Big Data Looks Small
67
2
  ================================
68
3
 
@@ -107,7 +42,7 @@ sub-linear query time:
107
42
  | `HNSW`_ | Any | Custom Metric Top-K |
108
43
  +---------------------------+-----------------------------+------------------------+
109
44
 
110
- datasketch must be used with Python 3.8 or above, NumPy 1.11 or above, and Scipy.
45
+ datasketch must be used with Python 3.9 or above, NumPy 1.11 or above, and SciPy.
111
46
 
112
47
  Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
113
48
  storage layer (see `MinHash LSH at Scale`_).
@@ -117,7 +52,7 @@ Install
117
52
 
118
53
  To install datasketch using ``pip``:
119
54
 
120
- ::
55
+ .. code-block:: bash
121
56
 
122
57
  pip install datasketch
123
58
 
@@ -125,19 +60,19 @@ This will also install NumPy as dependency.
125
60
 
126
61
  To install with Redis dependency:
127
62
 
128
- ::
63
+ .. code-block:: bash
129
64
 
130
65
  pip install datasketch[redis]
131
66
 
132
67
  To install with Cassandra dependency:
133
68
 
134
- ::
69
+ .. code-block:: bash
135
70
 
136
71
  pip install datasketch[cassandra]
137
72
 
138
73
  To install with Bloom filter dependency:
139
74
 
140
- ::
75
+ .. code-block:: bash
141
76
 
142
77
  pip install datasketch[bloom]
143
78
 
@@ -151,3 +86,116 @@ To install with Bloom filter dependency:
151
86
  .. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
152
87
  .. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
153
88
  .. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
89
+
90
+ Contributing
91
+ ------------
92
+
93
+ We welcome contributions from everyone. Whether you're fixing bugs, adding features, improving documentation, or helping with tests, your contributions are valuable.
94
+
95
+ Development Setup
96
+ ^^^^^^^^^^^^^^^^^
97
+
98
+ The project uses ``uv`` for fast and reliable Python package management. Follow these steps to set up your development environment:
99
+
100
+ 1. **Install uv**: Follow the official installation guide at https://docs.astral.sh/uv/getting-started/installation/
101
+
102
+ 2. **Clone the repository**:
103
+
104
+ .. code-block:: bash
105
+
106
+ git clone https://github.com/ekzhu/datasketch.git
107
+ cd datasketch
108
+
109
+ 3. **Set up the environment**:
110
+
111
+ .. code-block:: bash
112
+
113
+ # Create a virtual environment
114
+ # (Optional: specify Python version with --python 3.x)
115
+ uv venv
116
+ # Activate the virtual environment (optional, uv run commands work without it)
117
+ source .venv/bin/activate
118
+
119
+ # Install all dependencies
120
+ uv sync
121
+
122
+ 4. **Verify installation**:
123
+
124
+ .. code-block:: bash
125
+
126
+ # Run tests to ensure everything works
127
+ uv run pytest
128
+
129
+ 5. **Optional dependencies** (for specific development needs):
130
+
131
+ .. code-block:: bash
132
+
133
+ # For testing
134
+ uv sync --extra test
135
+
136
+ # For Cassandra support
137
+ uv sync --extra cassandra
138
+
139
+ # For Redis support
140
+ uv sync --extra redis
141
+
142
+ # For all extras
143
+ uv sync --all-extras
144
+
145
+ Learn more about ``uv`` at https://docs.astral.sh/uv/
146
+
147
+ Development Workflow
148
+ ^^^^^^^^^^^^^^^^^^^^
149
+
150
+ 1. **Fork the repository** on GitHub if you haven't already.
151
+
152
+ 2. **Create a feature branch** for your changes:
153
+
154
+ .. code-block:: bash
155
+
156
+ git checkout -b feature/your-feature-name
157
+ # Or for bug fixes:
158
+ git checkout -b fix/issue-description
159
+
160
+ 3. **Make your changes** following the project's coding standards.
161
+
162
+ 4. **Run the tests** to ensure nothing is broken:
163
+
164
+ .. code-block:: bash
165
+
166
+ uv run pytest
167
+
168
+ 5. **Check code quality** with ruff:
169
+
170
+ .. code-block:: bash
171
+
172
+ # Check for issues
173
+ uvx ruff check .
174
+
175
+ # Auto-fix formatting issues
176
+ uvx ruff format .
177
+
178
+ 6. **Commit your changes** with a clear, descriptive commit message:
179
+
180
+ .. code-block:: bash
181
+
182
+ git commit -m "Add feature: brief description of what was changed"
183
+
184
+ 7. **Push to your fork** and create a pull request on GitHub:
185
+
186
+ .. code-block:: bash
187
+
188
+ git push origin your-branch-name
189
+
190
+ 8. **Respond to feedback** from maintainers and iterate on your changes.
191
+
192
+ Guidelines
193
+ ^^^^^^^^^^
194
+
195
+ - Follow PEP 8 style guidelines
196
+ - Write tests for new features
197
+ - Update documentation as needed
198
+ - Keep commits focused and atomic
199
+ - Be respectful in discussions
200
+
201
+ For more information, check the `GitHub issues <https://github.com/ekzhu/datasketch/issues>`_ for current priorities or areas needing help. You can also join the discussion on `project roadmap and priorities <https://github.com/ekzhu/datasketch/discussions/252>`_.
@@ -1,18 +1,43 @@
1
- from datasketch.hyperloglog import HyperLogLog, HyperLogLogPlusPlus
2
- from datasketch.minhash import MinHash
1
+ import importlib.metadata
2
+ from typing import Final
3
+
4
+ try:
5
+ _version = importlib.metadata.version(__name__)
6
+ except importlib.metadata.PackageNotFoundError:
7
+ _version = "0.0.0" # Fallback when the package metadata is not installed (e.g. running from a source checkout)
8
+ __version__: Final[str] = _version
9
+
3
10
  from datasketch.b_bit_minhash import bBitMinHash
11
+ from datasketch.hashfunc import sha1_hash32
12
+ from datasketch.hnsw import HNSW
13
+ from datasketch.hyperloglog import HyperLogLog, HyperLogLogPlusPlus
14
+ from datasketch.lean_minhash import LeanMinHash
4
15
  from datasketch.lsh import MinHashLSH
5
16
  from datasketch.lsh_bloom import MinHashLSHBloom
6
- from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerator
7
- from datasketch.lshforest import MinHashLSHForest
8
17
  from datasketch.lshensemble import MinHashLSHEnsemble
9
- from datasketch.lean_minhash import LeanMinHash
10
- from datasketch.hashfunc import sha1_hash32
11
- from datasketch.hnsw import HNSW
18
+ from datasketch.lshforest import MinHashLSHForest
19
+ from datasketch.minhash import MinHash
20
+ from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerator
12
21
 
13
22
  # Alias
14
23
  WeightedMinHashLSH = MinHashLSH
15
24
  WeightedMinHashLSHForest = MinHashLSHForest
16
25
 
17
- # Version
18
- from datasketch.version import __version__
26
+
27
+ __all__ = [
28
+ "HNSW",
29
+ "HyperLogLog",
30
+ "HyperLogLogPlusPlus",
31
+ "LeanMinHash",
32
+ "MinHash",
33
+ "MinHashLSH",
34
+ "MinHashLSHBloom",
35
+ "MinHashLSHEnsemble",
36
+ "MinHashLSHForest",
37
+ "WeightedMinHash",
38
+ "WeightedMinHashGenerator",
39
+ "WeightedMinHashLSH",
40
+ "WeightedMinHashLSHForest",
41
+ "bBitMinHash",
42
+ "sha1_hash32",
43
+ ]