datasketch 1.6.5__tar.gz → 1.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. datasketch-1.8.0/.gitignore +87 -0
  2. datasketch-1.8.0/PKG-INFO +256 -0
  3. datasketch-1.6.5/PKG-INFO → datasketch-1.8.0/README.rst +125 -54
  4. datasketch-1.8.0/datasketch/__init__.py +43 -0
  5. {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/b_bit_minhash.py +59 -67
  6. {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/experimental/__init__.py +4 -7
  7. {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/experimental/aio/lsh.py +101 -117
  8. {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/experimental/aio/storage.py +105 -107
  9. {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/hashfunc.py +6 -3
  10. {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/hnsw.py +103 -151
  11. {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/hyperloglog.py +55 -66
  12. {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/lean_minhash.py +25 -25
  13. {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/lsh.py +189 -132
  14. datasketch-1.8.0/datasketch/lsh_bloom.py +377 -0
  15. {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/lshensemble.py +37 -45
  16. {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/lshensemble_partition.py +34 -33
  17. {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/lshforest.py +28 -36
  18. {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/minhash.py +171 -47
  19. {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/storage.py +175 -228
  20. {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/weighted_minhash.py +23 -25
  21. datasketch-1.8.0/pyproject.toml +162 -0
  22. datasketch-1.6.5/README.rst +0 -80
  23. datasketch-1.6.5/datasketch/__init__.py +0 -17
  24. datasketch-1.6.5/datasketch/version.py +0 -1
  25. datasketch-1.6.5/datasketch.egg-info/PKG-INFO +0 -130
  26. datasketch-1.6.5/datasketch.egg-info/SOURCES.txt +0 -36
  27. datasketch-1.6.5/datasketch.egg-info/dependency_links.txt +0 -1
  28. datasketch-1.6.5/datasketch.egg-info/requires.txt +0 -35
  29. datasketch-1.6.5/datasketch.egg-info/top_level.txt +0 -1
  30. datasketch-1.6.5/setup.cfg +0 -4
  31. datasketch-1.6.5/setup.py +0 -87
  32. datasketch-1.6.5/test/test_hnsw.py +0 -306
  33. datasketch-1.6.5/test/test_hyperloglog.py +0 -177
  34. datasketch-1.6.5/test/test_lean_minhash.py +0 -190
  35. datasketch-1.6.5/test/test_lsh.py +0 -451
  36. datasketch-1.6.5/test/test_lsh_cassandra.py +0 -269
  37. datasketch-1.6.5/test/test_lshensemble.py +0 -92
  38. datasketch-1.6.5/test/test_lshforest.py +0 -149
  39. datasketch-1.6.5/test/test_minhash.py +0 -203
  40. datasketch-1.6.5/test/test_weighted_minhash.py +0 -108
  41. {datasketch-1.6.5 → datasketch-1.8.0}/LICENSE +0 -0
  42. {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/experimental/aio/__init__.py +0 -0
  43. {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/hyperloglog_const.py +0 -0
@@ -0,0 +1,87 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+
5
+ # C extensions
6
+ *.so
7
+
8
+ # VIM stuff
9
+ *.swp
10
+
11
+ # Distribution / packaging
12
+ .Python
13
+ env/
14
+ build/
15
+ develop-eggs/
16
+ dist/
17
+ downloads/
18
+ eggs/
19
+ .eggs/
20
+ lib/
21
+ lib64/
22
+ parts/
23
+ sdist/
24
+ var/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .coverage
43
+ .coverage.*
44
+ .cache
45
+ nosetests.xml
46
+ coverage.xml
47
+ *,cover
48
+
49
+ # Translations
50
+ *.mo
51
+ *.pot
52
+
53
+ # Django stuff:
54
+ *.log
55
+
56
+ # Sphinx documentation
57
+ docs/_build/
58
+
59
+ # PyBuilder
60
+ target/
61
+
62
+ # Documentation
63
+ .doctrees
64
+ _build
65
+ doctrees
66
+
67
+ # Jetbrains
68
+ .idea
69
+
70
+ # Benchmark files
71
+ benchmark/**/*.inp.gz
72
+
73
+ # Benchmark output
74
+ benchmark/**/*.sqlite
75
+
76
+ # Benchmark plots
77
+ benchmark/**/*.png
78
+ benchmark/**/*.pdf
79
+
80
+ # Virtual env
81
+ .venv
82
+
83
+ # IDE
84
+ .vscode
85
+
86
+ # MacOS
87
+ .DS_Store
@@ -0,0 +1,256 @@
1
+ Metadata-Version: 2.4
2
+ Name: datasketch
3
+ Version: 1.8.0
4
+ Summary: Probabilistic data structures for processing and searching very large datasets
5
+ Project-URL: Homepage, https://ekzhu.github.io/datasketch
6
+ Project-URL: Bug Tracker, https://github.com/ekzhu/datasketch/issues
7
+ Project-URL: Documentation, https://ekzhu.github.io/datasketch
8
+ Project-URL: Source, https://github.com/ekzhu/datasketch
9
+ Author-email: ekzhu <ekzhu@cs.toronto.edu>
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Keywords: database,datamining
13
+ Classifier: Development Status :: 5 - Production/Stable
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Database
21
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
22
+ Requires-Python: >=3.9
23
+ Requires-Dist: numpy>=1.11
24
+ Requires-Dist: scipy>=1.0.0
25
+ Provides-Extra: benchmark
26
+ Requires-Dist: matplotlib>=3.1.2; extra == 'benchmark'
27
+ Requires-Dist: nltk>=3.4.5; extra == 'benchmark'
28
+ Requires-Dist: pandas>=0.25.3; extra == 'benchmark'
29
+ Requires-Dist: pyfarmhash>=0.2.2; extra == 'benchmark'
30
+ Requires-Dist: pyhash>=0.9.3; extra == 'benchmark'
31
+ Requires-Dist: scikit-learn>=0.21.3; extra == 'benchmark'
32
+ Requires-Dist: scipy>=1.3.3; extra == 'benchmark'
33
+ Requires-Dist: setsimilaritysearch>=0.1.7; extra == 'benchmark'
34
+ Provides-Extra: bloom
35
+ Requires-Dist: pybloomfilter3>=0.7.2; extra == 'bloom'
36
+ Provides-Extra: cassandra
37
+ Requires-Dist: cassandra-driver>=3.20; extra == 'cassandra'
38
+ Provides-Extra: experimental-aio
39
+ Requires-Dist: aiounittest; extra == 'experimental-aio'
40
+ Requires-Dist: motor>3.6.0; extra == 'experimental-aio'
41
+ Provides-Extra: redis
42
+ Requires-Dist: redis>=2.10.0; extra == 'redis'
43
+ Provides-Extra: test
44
+ Requires-Dist: cassandra-driver>=3.20; extra == 'test'
45
+ Requires-Dist: coverage; extra == 'test'
46
+ Requires-Dist: mock>=2.0.0; extra == 'test'
47
+ Requires-Dist: mockredispy; extra == 'test'
48
+ Requires-Dist: nose-exclude>=0.5.0; extra == 'test'
49
+ Requires-Dist: nose>=1.3.7; extra == 'test'
50
+ Requires-Dist: pymongo>=3.9.0; extra == 'test'
51
+ Requires-Dist: pytest; extra == 'test'
52
+ Requires-Dist: pytest-rerunfailures; extra == 'test'
53
+ Requires-Dist: redis>=2.10.0; extra == 'test'
54
+ Description-Content-Type: text/x-rst
55
+
56
+ datasketch: Big Data Looks Small
57
+ ================================
58
+
59
+ .. image:: https://static.pepy.tech/badge/datasketch/month
60
+ :target: https://pepy.tech/project/datasketch
61
+
62
+ .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
63
+ :target: https://zenodo.org/doi/10.5281/zenodo.598238
64
+
65
+ datasketch gives you probabilistic data structures that can process and
66
+ search very large amounts of data super fast, with little loss of
67
+ accuracy.
68
+
69
+ This package contains the following data sketches:
70
+
71
+ +-------------------------+-----------------------------------------------+
72
+ | Data Sketch | Usage |
73
+ +=========================+===============================================+
74
+ | `MinHash`_ | estimate Jaccard similarity and cardinality |
75
+ +-------------------------+-----------------------------------------------+
76
+ | `Weighted MinHash`_ | estimate weighted Jaccard similarity |
77
+ +-------------------------+-----------------------------------------------+
78
+ | `HyperLogLog`_ | estimate cardinality |
79
+ +-------------------------+-----------------------------------------------+
80
+ | `HyperLogLog++`_ | estimate cardinality |
81
+ +-------------------------+-----------------------------------------------+
82
+
83
+ The following indexes for data sketches are provided to support
84
+ sub-linear query time:
85
+
86
+ +---------------------------+-----------------------------+------------------------+
87
+ | Index | For Data Sketch | Supported Query Type |
88
+ +===========================+=============================+========================+
89
+ | `MinHash LSH`_ | MinHash, Weighted MinHash | Jaccard Threshold |
90
+ +---------------------------+-----------------------------+------------------------+
91
+ | `LSHBloom`_ | MinHash, Weighted MinHash | Jaccard Threshold |
92
+ +---------------------------+-----------------------------+------------------------+
93
+ | `MinHash LSH Forest`_ | MinHash, Weighted MinHash | Jaccard Top-K |
94
+ +---------------------------+-----------------------------+------------------------+
95
+ | `MinHash LSH Ensemble`_ | MinHash | Containment Threshold |
96
+ +---------------------------+-----------------------------+------------------------+
97
+ | `HNSW`_ | Any | Custom Metric Top-K |
98
+ +---------------------------+-----------------------------+------------------------+
99
+
100
+ datasketch must be used with Python 3.9 or above, NumPy 1.11 or above, and SciPy 1.0.0 or above.
101
+
102
+ Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
103
+ storage layer (see `MinHash LSH at Scale`_).
104
+
105
+ Install
106
+ -------
107
+
108
+ To install datasketch using ``pip``:
109
+
110
+ .. code-block:: bash
111
+
112
+ pip install datasketch
113
+
114
+ This will also install NumPy and SciPy as dependencies.
115
+
116
+ To install with Redis dependency:
117
+
118
+ .. code-block:: bash
119
+
120
+ pip install datasketch[redis]
121
+
122
+ To install with Cassandra dependency:
123
+
124
+ .. code-block:: bash
125
+
126
+ pip install datasketch[cassandra]
127
+
128
+ To install with Bloom filter dependency:
129
+
130
+ .. code-block:: bash
131
+
132
+ pip install datasketch[bloom]
133
+
134
+ .. _`MinHash`: https://ekzhu.github.io/datasketch/minhash.html
135
+ .. _`Weighted MinHash`: https://ekzhu.github.io/datasketch/weightedminhash.html
136
+ .. _`HyperLogLog`: https://ekzhu.github.io/datasketch/hyperloglog.html
137
+ .. _`HyperLogLog++`: https://ekzhu.github.io/datasketch/hyperloglog.html#hyperloglog-plusplus
138
+ .. _`MinHash LSH`: https://ekzhu.github.io/datasketch/lsh.html
139
+ .. _`MinHash LSH Forest`: https://ekzhu.github.io/datasketch/lshforest.html
140
+ .. _`MinHash LSH Ensemble`: https://ekzhu.github.io/datasketch/lshensemble.html
141
+ .. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
142
+ .. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
143
+ .. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
144
+
145
+ Contributing
146
+ ------------
147
+
148
+ We welcome contributions from everyone. Whether you're fixing bugs, adding features, improving documentation, or helping with tests, your contributions are valuable.
149
+
150
+ Development Setup
151
+ ^^^^^^^^^^^^^^^^^
152
+
153
+ The project uses `uv` for fast and reliable Python package management. Follow these steps to set up your development environment:
154
+
155
+ 1. **Install uv**: Follow the official installation guide at https://docs.astral.sh/uv/getting-started/installation/
156
+
157
+ 2. **Clone the repository**:
158
+
159
+ .. code-block:: bash
160
+
161
+ git clone https://github.com/ekzhu/datasketch.git
162
+ cd datasketch
163
+
164
+ 3. **Set up the environment**:
165
+
166
+ .. code-block:: bash
167
+
168
+ # Create a virtual environment
169
+ # (Optional: specify Python version with --python 3.x)
170
+ uv venv
171
+ # Activate the virtual environment (optional, uv run commands work without it)
172
+ source .venv/bin/activate
173
+
174
+ # Install all dependencies
175
+ uv sync
176
+
177
+ 4. **Verify installation**:
178
+
179
+ .. code-block:: bash
180
+
181
+ # Run tests to ensure everything works
182
+ uv run pytest
183
+
184
+ 5. **Optional dependencies** (for specific development needs):
185
+
186
+ .. code-block:: bash
187
+
188
+ # For testing
189
+ uv sync --extra test
190
+
191
+ # For Cassandra support
192
+ uv sync --extra cassandra
193
+
194
+ # For Redis support
195
+ uv sync --extra redis
196
+
197
+ # For all extras
198
+ uv sync --all-extras
199
+
200
+ Learn more about `uv` at https://docs.astral.sh/uv/
201
+
202
+ Development Workflow
203
+ ^^^^^^^^^^^^^^^^^^^^
204
+
205
+ 1. **Fork the repository** on GitHub if you haven't already.
206
+
207
+ 2. **Create a feature branch** for your changes:
208
+
209
+ .. code-block:: bash
210
+
211
+ git checkout -b feature/your-feature-name
212
+ # Or for bug fixes:
213
+ git checkout -b fix/issue-description
214
+
215
+ 3. **Make your changes** following the project's coding standards.
216
+
217
+ 4. **Run the tests** to ensure nothing is broken:
218
+
219
+ .. code-block:: bash
220
+
221
+ uv run pytest
222
+
223
+ 5. **Check code quality** with ruff:
224
+
225
+ .. code-block:: bash
226
+
227
+ # Check for issues
228
+ uvx ruff check .
229
+
230
+ # Auto-fix formatting issues
231
+ uvx ruff format .
232
+
233
+ 6. **Commit your changes** with a clear, descriptive commit message:
234
+
235
+ .. code-block:: bash
236
+
237
+ git commit -m "Add feature: brief description of what was changed"
238
+
239
+ 7. **Push to your fork** and create a pull request on GitHub:
240
+
241
+ .. code-block:: bash
242
+
243
+ git push origin your-branch-name
244
+
245
+ 8. **Respond to feedback** from maintainers and iterate on your changes.
246
+
247
+ Guidelines
248
+ ^^^^^^^^^^
249
+
250
+ - Follow PEP 8 style guidelines
251
+ - Write tests for new features
252
+ - Update documentation as needed
253
+ - Keep commits focused and atomic
254
+ - Be respectful in discussions
255
+
256
+ For more information, check the `GitHub issues <https://github.com/ekzhu/datasketch/issues>`_ for current priorities or areas needing help. You can also join the discussion on `project roadmap and priorities <https://github.com/ekzhu/datasketch/discussions/252>`_.
@@ -1,53 +1,3 @@
1
- Metadata-Version: 2.1
2
- Name: datasketch
3
- Version: 1.6.5
4
- Summary: Probabilistic data structures for processing and searching very large datasets
5
- Home-page: https://ekzhu.github.io/datasketch
6
- Author: ekzhu
7
- Author-email: ekzhu@cs.toronto.edu
8
- License: MIT
9
- Project-URL: Source, https://github.com/ekzhu/datasketch
10
- Keywords: database datamining
11
- Classifier: Development Status :: 5 - Production/Stable
12
- Classifier: Intended Audience :: Developers
13
- Classifier: Topic :: Database
14
- Classifier: Topic :: Scientific/Engineering :: Information Analysis
15
- Classifier: License :: OSI Approved :: MIT License
16
- Classifier: Programming Language :: Python :: 3.7
17
- Classifier: Programming Language :: Python :: 3.8
18
- Classifier: Programming Language :: Python :: 3.9
19
- Classifier: Programming Language :: Python :: 3.10
20
- Classifier: Programming Language :: Python :: 3.11
21
- License-File: LICENSE
22
- Requires-Dist: numpy>=1.11
23
- Requires-Dist: scipy>=1.0.0
24
- Provides-Extra: cassandra
25
- Requires-Dist: cassandra-driver>=3.20; extra == "cassandra"
26
- Provides-Extra: redis
27
- Requires-Dist: redis>=2.10.0; extra == "redis"
28
- Provides-Extra: benchmark
29
- Requires-Dist: pyhash>=0.9.3; extra == "benchmark"
30
- Requires-Dist: matplotlib>=3.1.2; extra == "benchmark"
31
- Requires-Dist: scikit-learn>=0.21.3; extra == "benchmark"
32
- Requires-Dist: scipy>=1.3.3; extra == "benchmark"
33
- Requires-Dist: pandas>=0.25.3; extra == "benchmark"
34
- Requires-Dist: SetSimilaritySearch>=0.1.7; extra == "benchmark"
35
- Requires-Dist: pyfarmhash>=0.2.2; extra == "benchmark"
36
- Requires-Dist: nltk>=3.4.5; extra == "benchmark"
37
- Provides-Extra: test
38
- Requires-Dist: cassandra-driver>=3.20; extra == "test"
39
- Requires-Dist: redis>=2.10.0; extra == "test"
40
- Requires-Dist: mock>=2.0.0; extra == "test"
41
- Requires-Dist: mockredispy; extra == "test"
42
- Requires-Dist: coverage; extra == "test"
43
- Requires-Dist: pymongo>=3.9.0; extra == "test"
44
- Requires-Dist: nose>=1.3.7; extra == "test"
45
- Requires-Dist: nose-exclude>=0.5.0; extra == "test"
46
- Requires-Dist: pytest; extra == "test"
47
- Provides-Extra: experimental-aio
48
- Requires-Dist: aiounittest; python_version >= "3.6" and extra == "experimental-aio"
49
- Requires-Dist: motor; python_version >= "3.6" and extra == "experimental-aio"
50
-
51
1
  datasketch: Big Data Looks Small
52
2
  ================================
53
3
 
@@ -83,6 +33,8 @@ sub-linear query time:
83
33
  +===========================+=============================+========================+
84
34
  | `MinHash LSH`_ | MinHash, Weighted MinHash | Jaccard Threshold |
85
35
  +---------------------------+-----------------------------+------------------------+
36
+ | `LSHBloom`_ | MinHash, Weighted MinHash | Jaccard Threshold |
37
+ +---------------------------+-----------------------------+------------------------+
86
38
  | `MinHash LSH Forest`_ | MinHash, Weighted MinHash | Jaccard Top-K |
87
39
  +---------------------------+-----------------------------+------------------------+
88
40
  | `MinHash LSH Ensemble`_ | MinHash | Containment Threshold |
@@ -90,7 +42,7 @@ sub-linear query time:
90
42
  | `HNSW`_ | Any | Custom Metric Top-K |
91
43
  +---------------------------+-----------------------------+------------------------+
92
44
 
93
- datasketch must be used with Python 3.7 or above, NumPy 1.11 or above, and Scipy.
45
+ datasketch must be used with Python 3.9 or above, NumPy 1.11 or above, and SciPy 1.0.0 or above.
94
46
 
95
47
  Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
96
48
  storage layer (see `MinHash LSH at Scale`_).
@@ -100,7 +52,7 @@ Install
100
52
 
101
53
  To install datasketch using ``pip``:
102
54
 
103
- ::
55
+ .. code-block:: bash
104
56
 
105
57
  pip install datasketch
106
58
 
@@ -108,16 +60,21 @@ This will also install NumPy as dependency.
108
60
 
109
61
  To install with Redis dependency:
110
62
 
111
- ::
63
+ .. code-block:: bash
112
64
 
113
65
  pip install datasketch[redis]
114
66
 
115
67
  To install with Cassandra dependency:
116
68
 
117
- ::
69
+ .. code-block:: bash
118
70
 
119
71
  pip install datasketch[cassandra]
120
72
 
73
+ To install with Bloom filter dependency:
74
+
75
+ .. code-block:: bash
76
+
77
+ pip install datasketch[bloom]
121
78
 
122
79
  .. _`MinHash`: https://ekzhu.github.io/datasketch/minhash.html
123
80
  .. _`Weighted MinHash`: https://ekzhu.github.io/datasketch/weightedminhash.html
@@ -126,5 +83,119 @@ To install with Cassandra dependency:
126
83
  .. _`MinHash LSH`: https://ekzhu.github.io/datasketch/lsh.html
127
84
  .. _`MinHash LSH Forest`: https://ekzhu.github.io/datasketch/lshforest.html
128
85
  .. _`MinHash LSH Ensemble`: https://ekzhu.github.io/datasketch/lshensemble.html
86
+ .. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
129
87
  .. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
130
88
  .. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
89
+
90
+ Contributing
91
+ ------------
92
+
93
+ We welcome contributions from everyone. Whether you're fixing bugs, adding features, improving documentation, or helping with tests, your contributions are valuable.
94
+
95
+ Development Setup
96
+ ^^^^^^^^^^^^^^^^^
97
+
98
+ The project uses `uv` for fast and reliable Python package management. Follow these steps to set up your development environment:
99
+
100
+ 1. **Install uv**: Follow the official installation guide at https://docs.astral.sh/uv/getting-started/installation/
101
+
102
+ 2. **Clone the repository**:
103
+
104
+ .. code-block:: bash
105
+
106
+ git clone https://github.com/ekzhu/datasketch.git
107
+ cd datasketch
108
+
109
+ 3. **Set up the environment**:
110
+
111
+ .. code-block:: bash
112
+
113
+ # Create a virtual environment
114
+ # (Optional: specify Python version with --python 3.x)
115
+ uv venv
116
+ # Activate the virtual environment (optional, uv run commands work without it)
117
+ source .venv/bin/activate
118
+
119
+ # Install all dependencies
120
+ uv sync
121
+
122
+ 4. **Verify installation**:
123
+
124
+ .. code-block:: bash
125
+
126
+ # Run tests to ensure everything works
127
+ uv run pytest
128
+
129
+ 5. **Optional dependencies** (for specific development needs):
130
+
131
+ .. code-block:: bash
132
+
133
+ # For testing
134
+ uv sync --extra test
135
+
136
+ # For Cassandra support
137
+ uv sync --extra cassandra
138
+
139
+ # For Redis support
140
+ uv sync --extra redis
141
+
142
+ # For all extras
143
+ uv sync --all-extras
144
+
145
+ Learn more about `uv` at https://docs.astral.sh/uv/
146
+
147
+ Development Workflow
148
+ ^^^^^^^^^^^^^^^^^^^^
149
+
150
+ 1. **Fork the repository** on GitHub if you haven't already.
151
+
152
+ 2. **Create a feature branch** for your changes:
153
+
154
+ .. code-block:: bash
155
+
156
+ git checkout -b feature/your-feature-name
157
+ # Or for bug fixes:
158
+ git checkout -b fix/issue-description
159
+
160
+ 3. **Make your changes** following the project's coding standards.
161
+
162
+ 4. **Run the tests** to ensure nothing is broken:
163
+
164
+ .. code-block:: bash
165
+
166
+ uv run pytest
167
+
168
+ 5. **Check code quality** with ruff:
169
+
170
+ .. code-block:: bash
171
+
172
+ # Check for issues
173
+ uvx ruff check .
174
+
175
+ # Auto-fix formatting issues
176
+ uvx ruff format .
177
+
178
+ 6. **Commit your changes** with a clear, descriptive commit message:
179
+
180
+ .. code-block:: bash
181
+
182
+ git commit -m "Add feature: brief description of what was changed"
183
+
184
+ 7. **Push to your fork** and create a pull request on GitHub:
185
+
186
+ .. code-block:: bash
187
+
188
+ git push origin your-branch-name
189
+
190
+ 8. **Respond to feedback** from maintainers and iterate on your changes.
191
+
192
+ Guidelines
193
+ ^^^^^^^^^^
194
+
195
+ - Follow PEP 8 style guidelines
196
+ - Write tests for new features
197
+ - Update documentation as needed
198
+ - Keep commits focused and atomic
199
+ - Be respectful in discussions
200
+
201
+ For more information, check the `GitHub issues <https://github.com/ekzhu/datasketch/issues>`_ for current priorities or areas needing help. You can also join the discussion on `project roadmap and priorities <https://github.com/ekzhu/datasketch/discussions/252>`_.
@@ -0,0 +1,43 @@
1
+ import importlib.metadata
2
+ from typing import Final
3
+
4
+ try:
5
+ _version = importlib.metadata.version(__name__)
6
+ except importlib.metadata.PackageNotFoundError:
7
+ _version = "0.0.0" # Fallback for development mode
8
+ __version__: Final[str] = _version
9
+
10
+ from datasketch.b_bit_minhash import bBitMinHash
11
+ from datasketch.hashfunc import sha1_hash32
12
+ from datasketch.hnsw import HNSW
13
+ from datasketch.hyperloglog import HyperLogLog, HyperLogLogPlusPlus
14
+ from datasketch.lean_minhash import LeanMinHash
15
+ from datasketch.lsh import MinHashLSH
16
+ from datasketch.lsh_bloom import MinHashLSHBloom
17
+ from datasketch.lshensemble import MinHashLSHEnsemble
18
+ from datasketch.lshforest import MinHashLSHForest
19
+ from datasketch.minhash import MinHash
20
+ from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerator
21
+
22
+ # Alias
23
+ WeightedMinHashLSH = MinHashLSH
24
+ WeightedMinHashLSHForest = MinHashLSHForest
25
+
26
+
27
+ __all__ = [
28
+ "HNSW",
29
+ "HyperLogLog",
30
+ "HyperLogLogPlusPlus",
31
+ "LeanMinHash",
32
+ "MinHash",
33
+ "MinHashLSH",
34
+ "MinHashLSHBloom",
35
+ "MinHashLSHEnsemble",
36
+ "MinHashLSHForest",
37
+ "WeightedMinHash",
38
+ "WeightedMinHashGenerator",
39
+ "WeightedMinHashLSH",
40
+ "WeightedMinHashLSHForest",
41
+ "bBitMinHash",
42
+ "sha1_hash32",
43
+ ]