datasketch 1.7.0__tar.gz → 1.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. datasketch-1.9.0/.gitignore +87 -0
  2. datasketch-1.9.0/PKG-INFO +262 -0
  3. datasketch-1.7.0/PKG-INFO → datasketch-1.9.0/README.rst +121 -70
  4. {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/__init__.py +34 -9
  5. {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/b_bit_minhash.py +57 -72
  6. {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/experimental/__init__.py +4 -7
  7. {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/experimental/aio/lsh.py +110 -118
  8. {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/experimental/aio/storage.py +138 -122
  9. {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/hashfunc.py +6 -3
  10. {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/hnsw.py +103 -151
  11. {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/hyperloglog.py +55 -66
  12. {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/lean_minhash.py +25 -25
  13. {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/lsh.py +199 -134
  14. datasketch-1.9.0/datasketch/lsh_bloom.py +377 -0
  15. {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/lshensemble.py +39 -46
  16. {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/lshensemble_partition.py +34 -33
  17. {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/lshforest.py +28 -36
  18. {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/minhash.py +176 -49
  19. {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/storage.py +188 -233
  20. {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/weighted_minhash.py +28 -30
  21. datasketch-1.9.0/pyproject.toml +197 -0
  22. datasketch-1.7.0/README.rst +0 -88
  23. datasketch-1.7.0/datasketch/lsh_bloom.py +0 -335
  24. datasketch-1.7.0/datasketch/version.py +0 -1
  25. datasketch-1.7.0/datasketch.egg-info/PKG-INFO +0 -153
  26. datasketch-1.7.0/datasketch.egg-info/SOURCES.txt +0 -38
  27. datasketch-1.7.0/datasketch.egg-info/dependency_links.txt +0 -1
  28. datasketch-1.7.0/datasketch.egg-info/requires.txt +0 -43
  29. datasketch-1.7.0/datasketch.egg-info/top_level.txt +0 -1
  30. datasketch-1.7.0/setup.cfg +0 -4
  31. datasketch-1.7.0/setup.py +0 -91
  32. datasketch-1.7.0/test/test_hnsw.py +0 -306
  33. datasketch-1.7.0/test/test_hyperloglog.py +0 -177
  34. datasketch-1.7.0/test/test_lean_minhash.py +0 -190
  35. datasketch-1.7.0/test/test_lsh.py +0 -451
  36. datasketch-1.7.0/test/test_lsh_cassandra.py +0 -269
  37. datasketch-1.7.0/test/test_lshbloom.py +0 -126
  38. datasketch-1.7.0/test/test_lshensemble.py +0 -92
  39. datasketch-1.7.0/test/test_lshforest.py +0 -149
  40. datasketch-1.7.0/test/test_minhash.py +0 -203
  41. datasketch-1.7.0/test/test_weighted_minhash.py +0 -108
  42. {datasketch-1.7.0 → datasketch-1.9.0}/LICENSE +0 -0
  43. {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/experimental/aio/__init__.py +0 -0
  44. {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/hyperloglog_const.py +0 -0
@@ -0,0 +1,87 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+
5
+ # C extensions
6
+ *.so
7
+
8
+ # VIM stuff
9
+ *.swp
10
+
11
+ # Distribution / packaging
12
+ .Python
13
+ env/
14
+ build/
15
+ develop-eggs/
16
+ dist/
17
+ downloads/
18
+ eggs/
19
+ .eggs/
20
+ lib/
21
+ lib64/
22
+ parts/
23
+ sdist/
24
+ var/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .coverage
43
+ .coverage.*
44
+ .cache
45
+ nosetests.xml
46
+ coverage.xml
47
+ *,cover
48
+
49
+ # Translations
50
+ *.mo
51
+ *.pot
52
+
53
+ # Django stuff:
54
+ *.log
55
+
56
+ # Sphinx documentation
57
+ docs/_build/
58
+
59
+ # PyBuilder
60
+ target/
61
+
62
+ # Documentation
63
+ .doctrees
64
+ _build
65
+ doctrees
66
+
67
+ # Jetbrains
68
+ .idea
69
+
70
+ # Benchmark files
71
+ benchmark/**/*.inp.gz
72
+
73
+ # Benchmark output
74
+ benchmark/**/*.sqlite
75
+
76
+ # Benchmark plots
77
+ benchmark/**/*.png
78
+ benchmark/**/*.pdf
79
+
80
+ # Virtual env
81
+ .venv
82
+
83
+ # IDE
84
+ .vscode
85
+
86
+ # MacOS
87
+ .DS_Store
@@ -0,0 +1,262 @@
1
+ Metadata-Version: 2.4
2
+ Name: datasketch
3
+ Version: 1.9.0
4
+ Summary: Probabilistic data structures for processing and searching very large datasets
5
+ Project-URL: Homepage, https://ekzhu.github.io/datasketch
6
+ Project-URL: Bug Tracker, https://github.com/ekzhu/datasketch/issues
7
+ Project-URL: Documentation, https://ekzhu.github.io/datasketch
8
+ Project-URL: Source, https://github.com/ekzhu/datasketch
9
+ Author-email: ekzhu <ekzhu@cs.toronto.edu>
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Keywords: database,datamining
13
+ Classifier: Development Status :: 5 - Production/Stable
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Database
22
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
23
+ Requires-Python: >=3.9
24
+ Requires-Dist: numpy>=1.11
25
+ Requires-Dist: scipy>=1.0.0
26
+ Provides-Extra: benchmark
27
+ Requires-Dist: matplotlib>=3.1.2; extra == 'benchmark'
28
+ Requires-Dist: nltk>=3.4.5; extra == 'benchmark'
29
+ Requires-Dist: pandas>=0.25.3; extra == 'benchmark'
30
+ Requires-Dist: pyfarmhash>=0.2.2; extra == 'benchmark'
31
+ Requires-Dist: pyhash>=0.9.3; extra == 'benchmark'
32
+ Requires-Dist: scikit-learn>=0.21.3; extra == 'benchmark'
33
+ Requires-Dist: scipy>=1.3.3; extra == 'benchmark'
34
+ Requires-Dist: setsimilaritysearch>=0.1.7; extra == 'benchmark'
35
+ Provides-Extra: bloom
36
+ Requires-Dist: pybloomfilter3>=0.7.2; extra == 'bloom'
37
+ Provides-Extra: cassandra
38
+ Requires-Dist: cassandra-driver>=3.20; extra == 'cassandra'
39
+ Provides-Extra: experimental-aio
40
+ Requires-Dist: aiounittest; extra == 'experimental-aio'
41
+ Requires-Dist: motor>3.6.0; extra == 'experimental-aio'
42
+ Provides-Extra: redis
43
+ Requires-Dist: redis>=2.10.0; extra == 'redis'
44
+ Provides-Extra: test
45
+ Requires-Dist: cassandra-driver>=3.20; extra == 'test'
46
+ Requires-Dist: coverage; extra == 'test'
47
+ Requires-Dist: mock>=2.0.0; extra == 'test'
48
+ Requires-Dist: mockredispy; extra == 'test'
49
+ Requires-Dist: nose-exclude>=0.5.0; extra == 'test'
50
+ Requires-Dist: nose>=1.3.7; extra == 'test'
51
+ Requires-Dist: pymongo>=3.9.0; extra == 'test'
52
+ Requires-Dist: pytest; extra == 'test'
53
+ Requires-Dist: pytest-asyncio; extra == 'test'
54
+ Requires-Dist: pytest-cov; extra == 'test'
55
+ Requires-Dist: pytest-rerunfailures; extra == 'test'
56
+ Requires-Dist: redis>=2.10.0; extra == 'test'
57
+ Description-Content-Type: text/x-rst
58
+
59
+ datasketch: Big Data Looks Small
60
+ ================================
61
+
62
+ .. image:: https://static.pepy.tech/badge/datasketch/month
63
+ :target: https://pepy.tech/project/datasketch
64
+
65
+ .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
66
+ :target: https://zenodo.org/doi/10.5281/zenodo.598238
67
+
68
+ .. image:: https://codecov.io/gh/ekzhu/datasketch/branch/master/graph/badge.svg
69
+ :target: https://codecov.io/gh/ekzhu/datasketch
70
+
71
+ datasketch gives you probabilistic data structures that can process and
72
+ search very large amounts of data super fast, with little loss of
73
+ accuracy.
74
+
75
+ This package contains the following data sketches:
76
+
77
+ +-------------------------+-----------------------------------------------+
78
+ | Data Sketch | Usage |
79
+ +=========================+===============================================+
80
+ | `MinHash`_ | estimate Jaccard similarity and cardinality |
81
+ +-------------------------+-----------------------------------------------+
82
+ | `Weighted MinHash`_ | estimate weighted Jaccard similarity |
83
+ +-------------------------+-----------------------------------------------+
84
+ | `HyperLogLog`_ | estimate cardinality |
85
+ +-------------------------+-----------------------------------------------+
86
+ | `HyperLogLog++`_ | estimate cardinality |
87
+ +-------------------------+-----------------------------------------------+
88
+
89
+ The following indexes for data sketches are provided to support
90
+ sub-linear query time:
91
+
92
+ +---------------------------+-----------------------------+------------------------+
93
+ | Index | For Data Sketch | Supported Query Type |
94
+ +===========================+=============================+========================+
95
+ | `MinHash LSH`_ | MinHash, Weighted MinHash | Jaccard Threshold |
96
+ +---------------------------+-----------------------------+------------------------+
97
+ | `LSHBloom`_ | MinHash, Weighted MinHash | Jaccard Threshold |
98
+ +---------------------------+-----------------------------+------------------------+
99
+ | `MinHash LSH Forest`_ | MinHash, Weighted MinHash | Jaccard Top-K |
100
+ +---------------------------+-----------------------------+------------------------+
101
+ | `MinHash LSH Ensemble`_ | MinHash | Containment Threshold |
102
+ +---------------------------+-----------------------------+------------------------+
103
+ | `HNSW`_ | Any | Custom Metric Top-K |
104
+ +---------------------------+-----------------------------+------------------------+
105
+
106
+ datasketch must be used with Python 3.9 or above, NumPy 1.11 or above, and Scipy.
107
+
108
+ Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
109
+ storage layers (see `MinHash LSH at Scale`_).
110
+
111
+ Install
112
+ -------
113
+
114
+ To install datasketch using ``pip``:
115
+
116
+ .. code-block:: bash
117
+
118
+ pip install datasketch
119
+
120
+ This will also install NumPy and SciPy as dependencies.
121
+
122
+ To install with Redis dependency:
123
+
124
+ .. code-block:: bash
125
+
126
+ pip install datasketch[redis]
127
+
128
+ To install with Cassandra dependency:
129
+
130
+ .. code-block:: bash
131
+
132
+ pip install datasketch[cassandra]
133
+
134
+ To install with Bloom filter dependency:
135
+
136
+ .. code-block:: bash
137
+
138
+ pip install datasketch[bloom]
139
+
140
+ .. _`MinHash`: https://ekzhu.github.io/datasketch/minhash.html
141
+ .. _`Weighted MinHash`: https://ekzhu.github.io/datasketch/weightedminhash.html
142
+ .. _`HyperLogLog`: https://ekzhu.github.io/datasketch/hyperloglog.html
143
+ .. _`HyperLogLog++`: https://ekzhu.github.io/datasketch/hyperloglog.html#hyperloglog-plusplus
144
+ .. _`MinHash LSH`: https://ekzhu.github.io/datasketch/lsh.html
145
+ .. _`MinHash LSH Forest`: https://ekzhu.github.io/datasketch/lshforest.html
146
+ .. _`MinHash LSH Ensemble`: https://ekzhu.github.io/datasketch/lshensemble.html
147
+ .. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
148
+ .. _`MinHash LSH at Scale`: https://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
149
+ .. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
150
+
151
+ Contributing
152
+ ------------
153
+
154
+ We welcome contributions from everyone. Whether you're fixing bugs, adding features, improving documentation, or helping with tests, your contributions are valuable.
155
+
156
+ Development Setup
157
+ ^^^^^^^^^^^^^^^^^
158
+
159
+ The project uses `uv` for fast and reliable Python package management. Follow these steps to set up your development environment:
160
+
161
+ 1. **Install uv**: Follow the official installation guide at https://docs.astral.sh/uv/getting-started/installation/
162
+
163
+ 2. **Clone the repository**:
164
+
165
+ .. code-block:: bash
166
+
167
+ git clone https://github.com/ekzhu/datasketch.git
168
+ cd datasketch
169
+
170
+ 3. **Set up the environment**:
171
+
172
+ .. code-block:: bash
173
+
174
+ # Create a virtual environment
175
+ # (Optional: specify Python version with --python 3.x)
176
+ uv venv
177
+ # Activate the virtual environment (optional, uv run commands work without it)
178
+ source .venv/bin/activate
179
+
180
+ # Install all dependencies
181
+ uv sync
182
+
183
+ 4. **Verify installation**:
184
+
185
+ .. code-block:: bash
186
+
187
+ # Run tests to ensure everything works
188
+ uv run pytest
189
+
190
+ 5. **Optional dependencies** (for specific development needs):
191
+
192
+ .. code-block:: bash
193
+
194
+ # For testing
195
+ uv sync --extra test
196
+
197
+ # For Cassandra support
198
+ uv sync --extra cassandra
199
+
200
+ # For Redis support
201
+ uv sync --extra redis
202
+
203
+ # For all extras
204
+ uv sync --all-extras
205
+
206
+ Learn more about `uv` at https://docs.astral.sh/uv/
207
+
208
+ Development Workflow
209
+ ^^^^^^^^^^^^^^^^^^^^
210
+
211
+ 1. **Fork the repository** on GitHub if you haven't already.
212
+
213
+ 2. **Create a feature branch** for your changes:
214
+
215
+ .. code-block:: bash
216
+
217
+ git checkout -b feature/your-feature-name
218
+ # Or for bug fixes:
219
+ git checkout -b fix/issue-description
220
+
221
+ 3. **Make your changes** following the project's coding standards.
222
+
223
+ 4. **Run the tests** to ensure nothing is broken:
224
+
225
+ .. code-block:: bash
226
+
227
+ uv run pytest
228
+
229
+ 5. **Check code quality** with ruff:
230
+
231
+ .. code-block:: bash
232
+
233
+ # Check for issues
234
+ uvx ruff check .
235
+
236
+ # Auto-fix formatting issues
237
+ uvx ruff format .
238
+
239
+ 6. **Commit your changes** with a clear, descriptive commit message:
240
+
241
+ .. code-block:: bash
242
+
243
+ git commit -m "Add feature: brief description of what was changed"
244
+
245
+ 7. **Push to your fork** and create a pull request on GitHub:
246
+
247
+ .. code-block:: bash
248
+
249
+ git push origin your-branch-name
250
+
251
+ 8. **Respond to feedback** from maintainers and iterate on your changes.
252
+
253
+ Guidelines
254
+ ^^^^^^^^^^
255
+
256
+ - Follow PEP 8 style guidelines
257
+ - Write tests for new features
258
+ - Update documentation as needed
259
+ - Keep commits focused and atomic
260
+ - Be respectful in discussions
261
+
262
+ For more information, check the `GitHub issues <https://github.com/ekzhu/datasketch/issues>`_ for current priorities or areas needing help. You can also join the discussion on `project roadmap and priorities <https://github.com/ekzhu/datasketch/discussions/252>`_.
@@ -1,68 +1,3 @@
1
- Metadata-Version: 2.4
2
- Name: datasketch
3
- Version: 1.7.0
4
- Summary: Probabilistic data structures for processing and searching very large datasets
5
- Home-page: https://ekzhu.github.io/datasketch
6
- Author: ekzhu
7
- Author-email: ekzhu@cs.toronto.edu
8
- License: MIT
9
- Project-URL: Source, https://github.com/ekzhu/datasketch
10
- Keywords: database datamining
11
- Classifier: Development Status :: 5 - Production/Stable
12
- Classifier: Intended Audience :: Developers
13
- Classifier: Topic :: Database
14
- Classifier: Topic :: Scientific/Engineering :: Information Analysis
15
- Classifier: License :: OSI Approved :: MIT License
16
- Classifier: Programming Language :: Python :: 3.8
17
- Classifier: Programming Language :: Python :: 3.9
18
- Classifier: Programming Language :: Python :: 3.10
19
- Classifier: Programming Language :: Python :: 3.11
20
- Classifier: Programming Language :: Python :: 3.12
21
- License-File: LICENSE
22
- Requires-Dist: numpy>=1.11
23
- Requires-Dist: scipy>=1.0.0
24
- Provides-Extra: cassandra
25
- Requires-Dist: cassandra-driver>=3.20; extra == "cassandra"
26
- Provides-Extra: redis
27
- Requires-Dist: redis>=2.10.0; extra == "redis"
28
- Provides-Extra: bloom
29
- Requires-Dist: pybloomfilter3>=0.7.2; python_version >= "3.9" and extra == "bloom"
30
- Requires-Dist: pybloomfiltermmap3==0.6.0; python_version < "3.9" and extra == "bloom"
31
- Provides-Extra: benchmark
32
- Requires-Dist: pyhash>=0.9.3; extra == "benchmark"
33
- Requires-Dist: matplotlib>=3.1.2; extra == "benchmark"
34
- Requires-Dist: scikit-learn>=0.21.3; extra == "benchmark"
35
- Requires-Dist: scipy>=1.3.3; extra == "benchmark"
36
- Requires-Dist: pandas>=0.25.3; extra == "benchmark"
37
- Requires-Dist: SetSimilaritySearch>=0.1.7; extra == "benchmark"
38
- Requires-Dist: pyfarmhash>=0.2.2; extra == "benchmark"
39
- Requires-Dist: nltk>=3.4.5; extra == "benchmark"
40
- Provides-Extra: test
41
- Requires-Dist: cassandra-driver>=3.20; extra == "test"
42
- Requires-Dist: redis>=2.10.0; extra == "test"
43
- Requires-Dist: mock>=2.0.0; extra == "test"
44
- Requires-Dist: mockredispy; extra == "test"
45
- Requires-Dist: coverage; extra == "test"
46
- Requires-Dist: pymongo>=3.9.0; extra == "test"
47
- Requires-Dist: nose>=1.3.7; extra == "test"
48
- Requires-Dist: nose-exclude>=0.5.0; extra == "test"
49
- Requires-Dist: pytest; extra == "test"
50
- Provides-Extra: experimental-aio
51
- Requires-Dist: aiounittest; python_version >= "3.8" and extra == "experimental-aio"
52
- Requires-Dist: motor>3.6.0; python_version >= "3.8" and extra == "experimental-aio"
53
- Dynamic: author
54
- Dynamic: author-email
55
- Dynamic: classifier
56
- Dynamic: description
57
- Dynamic: home-page
58
- Dynamic: keywords
59
- Dynamic: license
60
- Dynamic: license-file
61
- Dynamic: project-url
62
- Dynamic: provides-extra
63
- Dynamic: requires-dist
64
- Dynamic: summary
65
-
66
1
  datasketch: Big Data Looks Small
67
2
  ================================
68
3
 
@@ -72,6 +7,9 @@ datasketch: Big Data Looks Small
72
7
  .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
73
8
  :target: https://zenodo.org/doi/10.5281/zenodo.598238
74
9
 
10
+ .. image:: https://codecov.io/gh/ekzhu/datasketch/branch/master/graph/badge.svg
11
+ :target: https://codecov.io/gh/ekzhu/datasketch
12
+
75
13
  datasketch gives you probabilistic data structures that can process and
76
14
  search very large amounts of data super fast, with little loss of
77
15
  accuracy.
@@ -107,7 +45,7 @@ sub-linear query time:
107
45
  | `HNSW`_ | Any | Custom Metric Top-K |
108
46
  +---------------------------+-----------------------------+------------------------+
109
47
 
110
- datasketch must be used with Python 3.8 or above, NumPy 1.11 or above, and Scipy.
48
+ datasketch must be used with Python 3.9 or above, NumPy 1.11 or above, and Scipy.
111
49
 
112
50
  Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
113
51
  storage layers (see `MinHash LSH at Scale`_).
@@ -117,7 +55,7 @@ Install
117
55
 
118
56
  To install datasketch using ``pip``:
119
57
 
120
- ::
58
+ .. code-block:: bash
121
59
 
122
60
  pip install datasketch
123
61
 
@@ -125,19 +63,19 @@ This will also install NumPy as dependency.
125
63
 
126
64
  To install with Redis dependency:
127
65
 
128
- ::
66
+ .. code-block:: bash
129
67
 
130
68
  pip install datasketch[redis]
131
69
 
132
70
  To install with Cassandra dependency:
133
71
 
134
- ::
72
+ .. code-block:: bash
135
73
 
136
74
  pip install datasketch[cassandra]
137
75
 
138
76
  To install with Bloom filter dependency:
139
77
 
140
- ::
78
+ .. code-block:: bash
141
79
 
142
80
  pip install datasketch[bloom]
143
81
 
@@ -151,3 +89,116 @@ To install with Bloom filter dependency:
151
89
  .. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
152
90
  .. _`MinHash LSH at Scale`: https://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
153
91
  .. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
92
+
93
+ Contributing
94
+ ------------
95
+
96
+ We welcome contributions from everyone. Whether you're fixing bugs, adding features, improving documentation, or helping with tests, your contributions are valuable.
97
+
98
+ Development Setup
99
+ ^^^^^^^^^^^^^^^^^
100
+
101
+ The project uses `uv` for fast and reliable Python package management. Follow these steps to set up your development environment:
102
+
103
+ 1. **Install uv**: Follow the official installation guide at https://docs.astral.sh/uv/getting-started/installation/
104
+
105
+ 2. **Clone the repository**:
106
+
107
+ .. code-block:: bash
108
+
109
+ git clone https://github.com/ekzhu/datasketch.git
110
+ cd datasketch
111
+
112
+ 3. **Set up the environment**:
113
+
114
+ .. code-block:: bash
115
+
116
+ # Create a virtual environment
117
+ # (Optional: specify Python version with --python 3.x)
118
+ uv venv
119
+ # Activate the virtual environment (optional, uv run commands work without it)
120
+ source .venv/bin/activate
121
+
122
+ # Install all dependencies
123
+ uv sync
124
+
125
+ 4. **Verify installation**:
126
+
127
+ .. code-block:: bash
128
+
129
+ # Run tests to ensure everything works
130
+ uv run pytest
131
+
132
+ 5. **Optional dependencies** (for specific development needs):
133
+
134
+ .. code-block:: bash
135
+
136
+ # For testing
137
+ uv sync --extra test
138
+
139
+ # For Cassandra support
140
+ uv sync --extra cassandra
141
+
142
+ # For Redis support
143
+ uv sync --extra redis
144
+
145
+ # For all extras
146
+ uv sync --all-extras
147
+
148
+ Learn more about `uv` at https://docs.astral.sh/uv/
149
+
150
+ Development Workflow
151
+ ^^^^^^^^^^^^^^^^^^^^
152
+
153
+ 1. **Fork the repository** on GitHub if you haven't already.
154
+
155
+ 2. **Create a feature branch** for your changes:
156
+
157
+ .. code-block:: bash
158
+
159
+ git checkout -b feature/your-feature-name
160
+ # Or for bug fixes:
161
+ git checkout -b fix/issue-description
162
+
163
+ 3. **Make your changes** following the project's coding standards.
164
+
165
+ 4. **Run the tests** to ensure nothing is broken:
166
+
167
+ .. code-block:: bash
168
+
169
+ uv run pytest
170
+
171
+ 5. **Check code quality** with ruff:
172
+
173
+ .. code-block:: bash
174
+
175
+ # Check for issues
176
+ uvx ruff check .
177
+
178
+ # Auto-fix formatting issues
179
+ uvx ruff format .
180
+
181
+ 6. **Commit your changes** with a clear, descriptive commit message:
182
+
183
+ .. code-block:: bash
184
+
185
+ git commit -m "Add feature: brief description of what was changed"
186
+
187
+ 7. **Push to your fork** and create a pull request on GitHub:
188
+
189
+ .. code-block:: bash
190
+
191
+ git push origin your-branch-name
192
+
193
+ 8. **Respond to feedback** from maintainers and iterate on your changes.
194
+
195
+ Guidelines
196
+ ^^^^^^^^^^
197
+
198
+ - Follow PEP 8 style guidelines
199
+ - Write tests for new features
200
+ - Update documentation as needed
201
+ - Keep commits focused and atomic
202
+ - Be respectful in discussions
203
+
204
+ For more information, check the `GitHub issues <https://github.com/ekzhu/datasketch/issues>`_ for current priorities or areas needing help. You can also join the discussion on `project roadmap and priorities <https://github.com/ekzhu/datasketch/discussions/252>`_.
@@ -1,18 +1,43 @@
1
- from datasketch.hyperloglog import HyperLogLog, HyperLogLogPlusPlus
2
- from datasketch.minhash import MinHash
1
+ import importlib.metadata
2
+ from typing import Final
3
+
4
+ try:
5
+ _version = importlib.metadata.version(__name__)
6
+ except importlib.metadata.PackageNotFoundError:
7
+ _version = "0.0.0" # Fallback for development mode
8
+ __version__: Final[str] = _version
9
+
3
10
  from datasketch.b_bit_minhash import bBitMinHash
11
+ from datasketch.hashfunc import sha1_hash32
12
+ from datasketch.hnsw import HNSW
13
+ from datasketch.hyperloglog import HyperLogLog, HyperLogLogPlusPlus
14
+ from datasketch.lean_minhash import LeanMinHash
4
15
  from datasketch.lsh import MinHashLSH
5
16
  from datasketch.lsh_bloom import MinHashLSHBloom
6
- from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerator
7
- from datasketch.lshforest import MinHashLSHForest
8
17
  from datasketch.lshensemble import MinHashLSHEnsemble
9
- from datasketch.lean_minhash import LeanMinHash
10
- from datasketch.hashfunc import sha1_hash32
11
- from datasketch.hnsw import HNSW
18
+ from datasketch.lshforest import MinHashLSHForest
19
+ from datasketch.minhash import MinHash
20
+ from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerator
12
21
 
13
22
  # Alias
14
23
  WeightedMinHashLSH = MinHashLSH
15
24
  WeightedMinHashLSHForest = MinHashLSHForest
16
25
 
17
- # Version
18
- from datasketch.version import __version__
26
+
27
+ __all__ = [
28
+ "HNSW",
29
+ "HyperLogLog",
30
+ "HyperLogLogPlusPlus",
31
+ "LeanMinHash",
32
+ "MinHash",
33
+ "MinHashLSH",
34
+ "MinHashLSHBloom",
35
+ "MinHashLSHEnsemble",
36
+ "MinHashLSHForest",
37
+ "WeightedMinHash",
38
+ "WeightedMinHashGenerator",
39
+ "WeightedMinHashLSH",
40
+ "WeightedMinHashLSHForest",
41
+ "bBitMinHash",
42
+ "sha1_hash32",
43
+ ]