datasketch 1.7.0__tar.gz → 1.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datasketch-1.8.0/.gitignore +87 -0
- {datasketch-1.7.0/datasketch.egg-info → datasketch-1.8.0}/PKG-INFO +156 -53
- datasketch-1.7.0/PKG-INFO → datasketch-1.8.0/README.rst +118 -70
- {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/__init__.py +34 -9
- {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/b_bit_minhash.py +57 -72
- {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/experimental/__init__.py +4 -7
- {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/experimental/aio/lsh.py +101 -117
- {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/experimental/aio/storage.py +105 -107
- {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/hashfunc.py +6 -3
- {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/hnsw.py +103 -151
- {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/hyperloglog.py +55 -66
- {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/lean_minhash.py +25 -25
- {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/lsh.py +189 -132
- datasketch-1.8.0/datasketch/lsh_bloom.py +377 -0
- {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/lshensemble.py +37 -45
- {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/lshensemble_partition.py +34 -33
- {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/lshforest.py +28 -36
- {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/minhash.py +171 -47
- {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/storage.py +175 -228
- {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/weighted_minhash.py +23 -25
- datasketch-1.8.0/pyproject.toml +162 -0
- datasketch-1.7.0/README.rst +0 -88
- datasketch-1.7.0/datasketch/lsh_bloom.py +0 -335
- datasketch-1.7.0/datasketch/version.py +0 -1
- datasketch-1.7.0/datasketch.egg-info/SOURCES.txt +0 -38
- datasketch-1.7.0/datasketch.egg-info/dependency_links.txt +0 -1
- datasketch-1.7.0/datasketch.egg-info/requires.txt +0 -43
- datasketch-1.7.0/datasketch.egg-info/top_level.txt +0 -1
- datasketch-1.7.0/setup.cfg +0 -4
- datasketch-1.7.0/setup.py +0 -91
- datasketch-1.7.0/test/test_hnsw.py +0 -306
- datasketch-1.7.0/test/test_hyperloglog.py +0 -177
- datasketch-1.7.0/test/test_lean_minhash.py +0 -190
- datasketch-1.7.0/test/test_lsh.py +0 -451
- datasketch-1.7.0/test/test_lsh_cassandra.py +0 -269
- datasketch-1.7.0/test/test_lshbloom.py +0 -126
- datasketch-1.7.0/test/test_lshensemble.py +0 -92
- datasketch-1.7.0/test/test_lshforest.py +0 -149
- datasketch-1.7.0/test/test_minhash.py +0 -203
- datasketch-1.7.0/test/test_weighted_minhash.py +0 -108
- {datasketch-1.7.0 → datasketch-1.8.0}/LICENSE +0 -0
- {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/experimental/aio/__init__.py +0 -0
- {datasketch-1.7.0 → datasketch-1.8.0}/datasketch/hyperloglog_const.py +0 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
|
|
5
|
+
# C extensions
|
|
6
|
+
*.so
|
|
7
|
+
|
|
8
|
+
# VIM stuff
|
|
9
|
+
*.swp
|
|
10
|
+
|
|
11
|
+
# Distribution / packaging
|
|
12
|
+
.Python
|
|
13
|
+
env/
|
|
14
|
+
build/
|
|
15
|
+
develop-eggs/
|
|
16
|
+
dist/
|
|
17
|
+
downloads/
|
|
18
|
+
eggs/
|
|
19
|
+
.eggs/
|
|
20
|
+
lib/
|
|
21
|
+
lib64/
|
|
22
|
+
parts/
|
|
23
|
+
sdist/
|
|
24
|
+
var/
|
|
25
|
+
*.egg-info/
|
|
26
|
+
.installed.cfg
|
|
27
|
+
*.egg
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.coverage
|
|
43
|
+
.coverage.*
|
|
44
|
+
.cache
|
|
45
|
+
nosetests.xml
|
|
46
|
+
coverage.xml
|
|
47
|
+
*,cover
|
|
48
|
+
|
|
49
|
+
# Translations
|
|
50
|
+
*.mo
|
|
51
|
+
*.pot
|
|
52
|
+
|
|
53
|
+
# Django stuff:
|
|
54
|
+
*.log
|
|
55
|
+
|
|
56
|
+
# Sphinx documentation
|
|
57
|
+
docs/_build/
|
|
58
|
+
|
|
59
|
+
# PyBuilder
|
|
60
|
+
target/
|
|
61
|
+
|
|
62
|
+
# Documentation
|
|
63
|
+
.doctrees
|
|
64
|
+
_build
|
|
65
|
+
doctrees
|
|
66
|
+
|
|
67
|
+
# Jetbrains
|
|
68
|
+
.idea
|
|
69
|
+
|
|
70
|
+
# Benchmark files
|
|
71
|
+
benchmark/**/*.inp.gz
|
|
72
|
+
|
|
73
|
+
# Benchmark output
|
|
74
|
+
benchmark/**/*.sqlite
|
|
75
|
+
|
|
76
|
+
# Benchmark plots
|
|
77
|
+
benchmark/**/*.png
|
|
78
|
+
benchmark/**/*.pdf
|
|
79
|
+
|
|
80
|
+
# Virtual env
|
|
81
|
+
.venv
|
|
82
|
+
|
|
83
|
+
# IDE
|
|
84
|
+
.vscode
|
|
85
|
+
|
|
86
|
+
# MacOS
|
|
87
|
+
.DS_Store
|
|
@@ -1,67 +1,57 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datasketch
|
|
3
|
-
Version: 1.7.0
|
|
3
|
+
Version: 1.8.0
|
|
4
4
|
Summary: Probabilistic data structures for processing and searching very large datasets
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
License: MIT
|
|
5
|
+
Project-URL: Homepage, https://ekzhu.github.io/datasketch
|
|
6
|
+
Project-URL: Bug Tracker, https://github.com/ekzhu/datasketch/issues
|
|
7
|
+
Project-URL: Documentation, https://ekzhu.github.io/datasketch
|
|
9
8
|
Project-URL: Source, https://github.com/ekzhu/datasketch
|
|
10
|
-
|
|
9
|
+
Author-email: ekzhu <ekzhu@cs.toronto.edu>
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: database,datamining
|
|
11
13
|
Classifier: Development Status :: 5 - Production/Stable
|
|
12
14
|
Classifier: Intended Audience :: Developers
|
|
13
|
-
Classifier: Topic :: Database
|
|
14
|
-
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
15
15
|
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
17
16
|
Classifier: Programming Language :: Python :: 3.9
|
|
18
17
|
Classifier: Programming Language :: Python :: 3.10
|
|
19
18
|
Classifier: Programming Language :: Python :: 3.11
|
|
20
19
|
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
-
|
|
20
|
+
Classifier: Topic :: Database
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
22
|
+
Requires-Python: >=3.9
|
|
22
23
|
Requires-Dist: numpy>=1.11
|
|
23
24
|
Requires-Dist: scipy>=1.0.0
|
|
25
|
+
Provides-Extra: benchmark
|
|
26
|
+
Requires-Dist: matplotlib>=3.1.2; extra == 'benchmark'
|
|
27
|
+
Requires-Dist: nltk>=3.4.5; extra == 'benchmark'
|
|
28
|
+
Requires-Dist: pandas>=0.25.3; extra == 'benchmark'
|
|
29
|
+
Requires-Dist: pyfarmhash>=0.2.2; extra == 'benchmark'
|
|
30
|
+
Requires-Dist: pyhash>=0.9.3; extra == 'benchmark'
|
|
31
|
+
Requires-Dist: scikit-learn>=0.21.3; extra == 'benchmark'
|
|
32
|
+
Requires-Dist: scipy>=1.3.3; extra == 'benchmark'
|
|
33
|
+
Requires-Dist: setsimilaritysearch>=0.1.7; extra == 'benchmark'
|
|
34
|
+
Provides-Extra: bloom
|
|
35
|
+
Requires-Dist: pybloomfilter3>=0.7.2; extra == 'bloom'
|
|
24
36
|
Provides-Extra: cassandra
|
|
25
|
-
Requires-Dist: cassandra-driver>=3.20; extra == "cassandra"
|
|
37
|
+
Requires-Dist: cassandra-driver>=3.20; extra == 'cassandra'
|
|
38
|
+
Provides-Extra: experimental-aio
|
|
39
|
+
Requires-Dist: aiounittest; extra == 'experimental-aio'
|
|
40
|
+
Requires-Dist: motor>3.6.0; extra == 'experimental-aio'
|
|
26
41
|
Provides-Extra: redis
|
|
27
|
-
Requires-Dist: redis>=2.10.0; extra == "redis"
|
|
28
|
-
Provides-Extra: bloom
|
|
29
|
-
Requires-Dist: pybloomfilter3>=0.7.2; python_version >= "3.9" and extra == "bloom"
|
|
30
|
-
Requires-Dist: pybloomfiltermmap3==0.6.0; python_version < "3.9" and extra == "bloom"
|
|
31
|
-
Provides-Extra: benchmark
|
|
32
|
-
Requires-Dist: pyhash>=0.9.3; extra == "benchmark"
|
|
33
|
-
Requires-Dist: matplotlib>=3.1.2; extra == "benchmark"
|
|
34
|
-
Requires-Dist: scikit-learn>=0.21.3; extra == "benchmark"
|
|
35
|
-
Requires-Dist: scipy>=1.3.3; extra == "benchmark"
|
|
36
|
-
Requires-Dist: pandas>=0.25.3; extra == "benchmark"
|
|
37
|
-
Requires-Dist: SetSimilaritySearch>=0.1.7; extra == "benchmark"
|
|
38
|
-
Requires-Dist: pyfarmhash>=0.2.2; extra == "benchmark"
|
|
39
|
-
Requires-Dist: nltk>=3.4.5; extra == "benchmark"
|
|
42
|
+
Requires-Dist: redis>=2.10.0; extra == 'redis'
|
|
40
43
|
Provides-Extra: test
|
|
41
|
-
Requires-Dist: cassandra-driver>=3.20; extra == "test"
|
|
42
|
-
Requires-Dist: redis>=2.10.0; extra == "test"
|
|
43
|
-
Requires-Dist: mock>=2.0.0; extra == "test"
|
|
44
|
-
Requires-Dist: mockredispy; extra == "test"
|
|
45
|
-
Requires-Dist: coverage; extra == "test"
|
|
46
|
-
Requires-Dist: pymongo>=3.9.0; extra == "test"
|
|
47
|
-
Requires-Dist: nose>=1.3.7; extra == "test"
|
|
48
|
-
Requires-Dist: nose-exclude>=0.5.0; extra == "test"
|
|
49
|
-
Requires-Dist: pytest; extra == "test"
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
Requires-Dist: motor>3.6.0; python_version >= "3.8" and extra == "experimental-aio"
|
|
53
|
-
Dynamic: author
|
|
54
|
-
Dynamic: author-email
|
|
55
|
-
Dynamic: classifier
|
|
56
|
-
Dynamic: description
|
|
57
|
-
Dynamic: home-page
|
|
58
|
-
Dynamic: keywords
|
|
59
|
-
Dynamic: license
|
|
60
|
-
Dynamic: license-file
|
|
61
|
-
Dynamic: project-url
|
|
62
|
-
Dynamic: provides-extra
|
|
63
|
-
Dynamic: requires-dist
|
|
64
|
-
Dynamic: summary
|
|
44
|
+
Requires-Dist: cassandra-driver>=3.20; extra == 'test'
|
|
45
|
+
Requires-Dist: coverage; extra == 'test'
|
|
46
|
+
Requires-Dist: mock>=2.0.0; extra == 'test'
|
|
47
|
+
Requires-Dist: mockredispy; extra == 'test'
|
|
48
|
+
Requires-Dist: nose-exclude>=0.5.0; extra == 'test'
|
|
49
|
+
Requires-Dist: nose>=1.3.7; extra == 'test'
|
|
50
|
+
Requires-Dist: pymongo>=3.9.0; extra == 'test'
|
|
51
|
+
Requires-Dist: pytest; extra == 'test'
|
|
52
|
+
Requires-Dist: pytest-rerunfailures; extra == 'test'
|
|
53
|
+
Requires-Dist: redis>=2.10.0; extra == 'test'
|
|
54
|
+
Description-Content-Type: text/x-rst
|
|
65
55
|
|
|
66
56
|
datasketch: Big Data Looks Small
|
|
67
57
|
================================
|
|
@@ -107,7 +97,7 @@ sub-linear query time:
|
|
|
107
97
|
| `HNSW`_ | Any | Custom Metric Top-K |
|
|
108
98
|
+---------------------------+-----------------------------+------------------------+
|
|
109
99
|
|
|
110
|
-
datasketch must be used with Python 3.
|
|
100
|
+
datasketch must be used with Python 3.9 or above, NumPy 1.11 or above, and Scipy.
|
|
111
101
|
|
|
112
102
|
Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
|
|
113
103
|
storage layer (see `MinHash LSH at Scale`_).
|
|
@@ -117,7 +107,7 @@ Install
|
|
|
117
107
|
|
|
118
108
|
To install datasketch using ``pip``:
|
|
119
109
|
|
|
120
|
-
::
|
|
110
|
+
.. code-block:: bash
|
|
121
111
|
|
|
122
112
|
pip install datasketch
|
|
123
113
|
|
|
@@ -125,19 +115,19 @@ This will also install NumPy as dependency.
|
|
|
125
115
|
|
|
126
116
|
To install with Redis dependency:
|
|
127
117
|
|
|
128
|
-
::
|
|
118
|
+
.. code-block:: bash
|
|
129
119
|
|
|
130
120
|
pip install datasketch[redis]
|
|
131
121
|
|
|
132
122
|
To install with Cassandra dependency:
|
|
133
123
|
|
|
134
|
-
::
|
|
124
|
+
.. code-block:: bash
|
|
135
125
|
|
|
136
126
|
pip install datasketch[cassandra]
|
|
137
127
|
|
|
138
128
|
To install with Bloom filter dependency:
|
|
139
129
|
|
|
140
|
-
::
|
|
130
|
+
.. code-block:: bash
|
|
141
131
|
|
|
142
132
|
pip install datasketch[bloom]
|
|
143
133
|
|
|
@@ -151,3 +141,116 @@ To install with Bloom filter dependency:
|
|
|
151
141
|
.. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
|
|
152
142
|
.. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
|
|
153
143
|
.. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
|
|
144
|
+
|
|
145
|
+
Contributing
|
|
146
|
+
------------
|
|
147
|
+
|
|
148
|
+
We welcome contributions from everyone. Whether you're fixing bugs, adding features, improving documentation, or helping with tests, your contributions are valuable.
|
|
149
|
+
|
|
150
|
+
Development Setup
|
|
151
|
+
^^^^^^^^^^^^^^^^^
|
|
152
|
+
|
|
153
|
+
The project uses `uv` for fast and reliable Python package management. Follow these steps to set up your development environment:
|
|
154
|
+
|
|
155
|
+
1. **Install uv**: Follow the official installation guide at https://docs.astral.sh/uv/getting-started/installation/
|
|
156
|
+
|
|
157
|
+
2. **Clone the repository**:
|
|
158
|
+
|
|
159
|
+
.. code-block:: bash
|
|
160
|
+
|
|
161
|
+
git clone https://github.com/ekzhu/datasketch.git
|
|
162
|
+
cd datasketch
|
|
163
|
+
|
|
164
|
+
3. **Set up the environment**:
|
|
165
|
+
|
|
166
|
+
.. code-block:: bash
|
|
167
|
+
|
|
168
|
+
# Create a virtual environment
|
|
169
|
+
# (Optional: specify Python version with --python 3.x)
|
|
170
|
+
uv venv
|
|
171
|
+
# Activate the virtual environment (optional, uv run commands work without it)
|
|
172
|
+
source .venv/bin/activate
|
|
173
|
+
|
|
174
|
+
# Install all dependencies
|
|
175
|
+
uv sync
|
|
176
|
+
|
|
177
|
+
4. **Verify installation**:
|
|
178
|
+
|
|
179
|
+
.. code-block:: bash
|
|
180
|
+
|
|
181
|
+
# Run tests to ensure everything works
|
|
182
|
+
uv run pytest
|
|
183
|
+
|
|
184
|
+
5. **Optional dependencies** (for specific development needs):
|
|
185
|
+
|
|
186
|
+
.. code-block:: bash
|
|
187
|
+
|
|
188
|
+
# For testing
|
|
189
|
+
uv sync --extra test
|
|
190
|
+
|
|
191
|
+
# For Cassandra support
|
|
192
|
+
uv sync --extra cassandra
|
|
193
|
+
|
|
194
|
+
# For Redis support
|
|
195
|
+
uv sync --extra redis
|
|
196
|
+
|
|
197
|
+
# For all extras
|
|
198
|
+
uv sync --all-extras
|
|
199
|
+
|
|
200
|
+
Learn more about `uv` at https://docs.astral.sh/uv/
|
|
201
|
+
|
|
202
|
+
Development Workflow
|
|
203
|
+
^^^^^^^^^^^^^^^^^^^^
|
|
204
|
+
|
|
205
|
+
1. **Fork the repository** on GitHub if you haven't already.
|
|
206
|
+
|
|
207
|
+
2. **Create a feature branch** for your changes:
|
|
208
|
+
|
|
209
|
+
.. code-block:: bash
|
|
210
|
+
|
|
211
|
+
git checkout -b feature/your-feature-name
|
|
212
|
+
# Or for bug fixes:
|
|
213
|
+
git checkout -b fix/issue-description
|
|
214
|
+
|
|
215
|
+
3. **Make your changes** following the project's coding standards.
|
|
216
|
+
|
|
217
|
+
4. **Run the tests** to ensure nothing is broken:
|
|
218
|
+
|
|
219
|
+
.. code-block:: bash
|
|
220
|
+
|
|
221
|
+
uv run pytest
|
|
222
|
+
|
|
223
|
+
5. **Check code quality** with ruff:
|
|
224
|
+
|
|
225
|
+
.. code-block:: bash
|
|
226
|
+
|
|
227
|
+
# Check for issues
|
|
228
|
+
uvx ruff check .
|
|
229
|
+
|
|
230
|
+
# Auto-fix formatting issues
|
|
231
|
+
uvx ruff format .
|
|
232
|
+
|
|
233
|
+
6. **Commit your changes** with a clear, descriptive commit message:
|
|
234
|
+
|
|
235
|
+
.. code-block:: bash
|
|
236
|
+
|
|
237
|
+
git commit -m "Add feature: brief description of what was changed"
|
|
238
|
+
|
|
239
|
+
7. **Push to your fork** and create a pull request on GitHub:
|
|
240
|
+
|
|
241
|
+
.. code-block:: bash
|
|
242
|
+
|
|
243
|
+
git push origin your-branch-name
|
|
244
|
+
|
|
245
|
+
8. **Respond to feedback** from maintainers and iterate on your changes.
|
|
246
|
+
|
|
247
|
+
Guidelines
|
|
248
|
+
^^^^^^^^^^
|
|
249
|
+
|
|
250
|
+
- Follow PEP 8 style guidelines
|
|
251
|
+
- Write tests for new features
|
|
252
|
+
- Update documentation as needed
|
|
253
|
+
- Keep commits focused and atomic
|
|
254
|
+
- Be respectful in discussions
|
|
255
|
+
|
|
256
|
+
For more information, check the `GitHub issues <https://github.com/ekzhu/datasketch/issues>`_ for current priorities or areas needing help. You can also join the discussion on `project roadmap and priorities <https://github.com/ekzhu/datasketch/discussions/252>`_.
|
|
@@ -1,68 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: datasketch
|
|
3
|
-
Version: 1.7.0
|
|
4
|
-
Summary: Probabilistic data structures for processing and searching very large datasets
|
|
5
|
-
Home-page: https://ekzhu.github.io/datasketch
|
|
6
|
-
Author: ekzhu
|
|
7
|
-
Author-email: ekzhu@cs.toronto.edu
|
|
8
|
-
License: MIT
|
|
9
|
-
Project-URL: Source, https://github.com/ekzhu/datasketch
|
|
10
|
-
Keywords: database datamining
|
|
11
|
-
Classifier: Development Status :: 5 - Production/Stable
|
|
12
|
-
Classifier: Intended Audience :: Developers
|
|
13
|
-
Classifier: Topic :: Database
|
|
14
|
-
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
15
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
-
License-File: LICENSE
|
|
22
|
-
Requires-Dist: numpy>=1.11
|
|
23
|
-
Requires-Dist: scipy>=1.0.0
|
|
24
|
-
Provides-Extra: cassandra
|
|
25
|
-
Requires-Dist: cassandra-driver>=3.20; extra == "cassandra"
|
|
26
|
-
Provides-Extra: redis
|
|
27
|
-
Requires-Dist: redis>=2.10.0; extra == "redis"
|
|
28
|
-
Provides-Extra: bloom
|
|
29
|
-
Requires-Dist: pybloomfilter3>=0.7.2; python_version >= "3.9" and extra == "bloom"
|
|
30
|
-
Requires-Dist: pybloomfiltermmap3==0.6.0; python_version < "3.9" and extra == "bloom"
|
|
31
|
-
Provides-Extra: benchmark
|
|
32
|
-
Requires-Dist: pyhash>=0.9.3; extra == "benchmark"
|
|
33
|
-
Requires-Dist: matplotlib>=3.1.2; extra == "benchmark"
|
|
34
|
-
Requires-Dist: scikit-learn>=0.21.3; extra == "benchmark"
|
|
35
|
-
Requires-Dist: scipy>=1.3.3; extra == "benchmark"
|
|
36
|
-
Requires-Dist: pandas>=0.25.3; extra == "benchmark"
|
|
37
|
-
Requires-Dist: SetSimilaritySearch>=0.1.7; extra == "benchmark"
|
|
38
|
-
Requires-Dist: pyfarmhash>=0.2.2; extra == "benchmark"
|
|
39
|
-
Requires-Dist: nltk>=3.4.5; extra == "benchmark"
|
|
40
|
-
Provides-Extra: test
|
|
41
|
-
Requires-Dist: cassandra-driver>=3.20; extra == "test"
|
|
42
|
-
Requires-Dist: redis>=2.10.0; extra == "test"
|
|
43
|
-
Requires-Dist: mock>=2.0.0; extra == "test"
|
|
44
|
-
Requires-Dist: mockredispy; extra == "test"
|
|
45
|
-
Requires-Dist: coverage; extra == "test"
|
|
46
|
-
Requires-Dist: pymongo>=3.9.0; extra == "test"
|
|
47
|
-
Requires-Dist: nose>=1.3.7; extra == "test"
|
|
48
|
-
Requires-Dist: nose-exclude>=0.5.0; extra == "test"
|
|
49
|
-
Requires-Dist: pytest; extra == "test"
|
|
50
|
-
Provides-Extra: experimental-aio
|
|
51
|
-
Requires-Dist: aiounittest; python_version >= "3.8" and extra == "experimental-aio"
|
|
52
|
-
Requires-Dist: motor>3.6.0; python_version >= "3.8" and extra == "experimental-aio"
|
|
53
|
-
Dynamic: author
|
|
54
|
-
Dynamic: author-email
|
|
55
|
-
Dynamic: classifier
|
|
56
|
-
Dynamic: description
|
|
57
|
-
Dynamic: home-page
|
|
58
|
-
Dynamic: keywords
|
|
59
|
-
Dynamic: license
|
|
60
|
-
Dynamic: license-file
|
|
61
|
-
Dynamic: project-url
|
|
62
|
-
Dynamic: provides-extra
|
|
63
|
-
Dynamic: requires-dist
|
|
64
|
-
Dynamic: summary
|
|
65
|
-
|
|
66
1
|
datasketch: Big Data Looks Small
|
|
67
2
|
================================
|
|
68
3
|
|
|
@@ -107,7 +42,7 @@ sub-linear query time:
|
|
|
107
42
|
| `HNSW`_ | Any | Custom Metric Top-K |
|
|
108
43
|
+---------------------------+-----------------------------+------------------------+
|
|
109
44
|
|
|
110
|
-
datasketch must be used with Python 3.
|
|
45
|
+
datasketch must be used with Python 3.9 or above, NumPy 1.11 or above, and Scipy.
|
|
111
46
|
|
|
112
47
|
Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
|
|
113
48
|
storage layer (see `MinHash LSH at Scale`_).
|
|
@@ -117,7 +52,7 @@ Install
|
|
|
117
52
|
|
|
118
53
|
To install datasketch using ``pip``:
|
|
119
54
|
|
|
120
|
-
::
|
|
55
|
+
.. code-block:: bash
|
|
121
56
|
|
|
122
57
|
pip install datasketch
|
|
123
58
|
|
|
@@ -125,19 +60,19 @@ This will also install NumPy as dependency.
|
|
|
125
60
|
|
|
126
61
|
To install with Redis dependency:
|
|
127
62
|
|
|
128
|
-
::
|
|
63
|
+
.. code-block:: bash
|
|
129
64
|
|
|
130
65
|
pip install datasketch[redis]
|
|
131
66
|
|
|
132
67
|
To install with Cassandra dependency:
|
|
133
68
|
|
|
134
|
-
::
|
|
69
|
+
.. code-block:: bash
|
|
135
70
|
|
|
136
71
|
pip install datasketch[cassandra]
|
|
137
72
|
|
|
138
73
|
To install with Bloom filter dependency:
|
|
139
74
|
|
|
140
|
-
::
|
|
75
|
+
.. code-block:: bash
|
|
141
76
|
|
|
142
77
|
pip install datasketch[bloom]
|
|
143
78
|
|
|
@@ -151,3 +86,116 @@ To install with Bloom filter dependency:
|
|
|
151
86
|
.. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
|
|
152
87
|
.. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
|
|
153
88
|
.. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
|
|
89
|
+
|
|
90
|
+
Contributing
|
|
91
|
+
------------
|
|
92
|
+
|
|
93
|
+
We welcome contributions from everyone. Whether you're fixing bugs, adding features, improving documentation, or helping with tests, your contributions are valuable.
|
|
94
|
+
|
|
95
|
+
Development Setup
|
|
96
|
+
^^^^^^^^^^^^^^^^^
|
|
97
|
+
|
|
98
|
+
The project uses `uv` for fast and reliable Python package management. Follow these steps to set up your development environment:
|
|
99
|
+
|
|
100
|
+
1. **Install uv**: Follow the official installation guide at https://docs.astral.sh/uv/getting-started/installation/
|
|
101
|
+
|
|
102
|
+
2. **Clone the repository**:
|
|
103
|
+
|
|
104
|
+
.. code-block:: bash
|
|
105
|
+
|
|
106
|
+
git clone https://github.com/ekzhu/datasketch.git
|
|
107
|
+
cd datasketch
|
|
108
|
+
|
|
109
|
+
3. **Set up the environment**:
|
|
110
|
+
|
|
111
|
+
.. code-block:: bash
|
|
112
|
+
|
|
113
|
+
# Create a virtual environment
|
|
114
|
+
# (Optional: specify Python version with --python 3.x)
|
|
115
|
+
uv venv
|
|
116
|
+
# Activate the virtual environment (optional, uv run commands work without it)
|
|
117
|
+
source .venv/bin/activate
|
|
118
|
+
|
|
119
|
+
# Install all dependencies
|
|
120
|
+
uv sync
|
|
121
|
+
|
|
122
|
+
4. **Verify installation**:
|
|
123
|
+
|
|
124
|
+
.. code-block:: bash
|
|
125
|
+
|
|
126
|
+
# Run tests to ensure everything works
|
|
127
|
+
uv run pytest
|
|
128
|
+
|
|
129
|
+
5. **Optional dependencies** (for specific development needs):
|
|
130
|
+
|
|
131
|
+
.. code-block:: bash
|
|
132
|
+
|
|
133
|
+
# For testing
|
|
134
|
+
uv sync --extra test
|
|
135
|
+
|
|
136
|
+
# For Cassandra support
|
|
137
|
+
uv sync --extra cassandra
|
|
138
|
+
|
|
139
|
+
# For Redis support
|
|
140
|
+
uv sync --extra redis
|
|
141
|
+
|
|
142
|
+
# For all extras
|
|
143
|
+
uv sync --all-extras
|
|
144
|
+
|
|
145
|
+
Learn more about `uv` at https://docs.astral.sh/uv/
|
|
146
|
+
|
|
147
|
+
Development Workflow
|
|
148
|
+
^^^^^^^^^^^^^^^^^^^^
|
|
149
|
+
|
|
150
|
+
1. **Fork the repository** on GitHub if you haven't already.
|
|
151
|
+
|
|
152
|
+
2. **Create a feature branch** for your changes:
|
|
153
|
+
|
|
154
|
+
.. code-block:: bash
|
|
155
|
+
|
|
156
|
+
git checkout -b feature/your-feature-name
|
|
157
|
+
# Or for bug fixes:
|
|
158
|
+
git checkout -b fix/issue-description
|
|
159
|
+
|
|
160
|
+
3. **Make your changes** following the project's coding standards.
|
|
161
|
+
|
|
162
|
+
4. **Run the tests** to ensure nothing is broken:
|
|
163
|
+
|
|
164
|
+
.. code-block:: bash
|
|
165
|
+
|
|
166
|
+
uv run pytest
|
|
167
|
+
|
|
168
|
+
5. **Check code quality** with ruff:
|
|
169
|
+
|
|
170
|
+
.. code-block:: bash
|
|
171
|
+
|
|
172
|
+
# Check for issues
|
|
173
|
+
uvx ruff check .
|
|
174
|
+
|
|
175
|
+
# Auto-fix formatting issues
|
|
176
|
+
uvx ruff format .
|
|
177
|
+
|
|
178
|
+
6. **Commit your changes** with a clear, descriptive commit message:
|
|
179
|
+
|
|
180
|
+
.. code-block:: bash
|
|
181
|
+
|
|
182
|
+
git commit -m "Add feature: brief description of what was changed"
|
|
183
|
+
|
|
184
|
+
7. **Push to your fork** and create a pull request on GitHub:
|
|
185
|
+
|
|
186
|
+
.. code-block:: bash
|
|
187
|
+
|
|
188
|
+
git push origin your-branch-name
|
|
189
|
+
|
|
190
|
+
8. **Respond to feedback** from maintainers and iterate on your changes.
|
|
191
|
+
|
|
192
|
+
Guidelines
|
|
193
|
+
^^^^^^^^^^
|
|
194
|
+
|
|
195
|
+
- Follow PEP 8 style guidelines
|
|
196
|
+
- Write tests for new features
|
|
197
|
+
- Update documentation as needed
|
|
198
|
+
- Keep commits focused and atomic
|
|
199
|
+
- Be respectful in discussions
|
|
200
|
+
|
|
201
|
+
For more information, check the `GitHub issues <https://github.com/ekzhu/datasketch/issues>`_ for current priorities or areas needing help. You can also join the discussion on `project roadmap and priorities <https://github.com/ekzhu/datasketch/discussions/252>`_.
|
|
@@ -1,18 +1,43 @@
|
|
|
1
|
-
|
|
2
|
-
from
|
|
1
|
+
import importlib.metadata
|
|
2
|
+
from typing import Final
|
|
3
|
+
|
|
4
|
+
try:
|
|
5
|
+
_version = importlib.metadata.version(__name__)
|
|
6
|
+
except importlib.metadata.PackageNotFoundError:
|
|
7
|
+
_version = "0.0.0" # Fallback for development mode
|
|
8
|
+
__version__: Final[str] = _version
|
|
9
|
+
|
|
3
10
|
from datasketch.b_bit_minhash import bBitMinHash
|
|
11
|
+
from datasketch.hashfunc import sha1_hash32
|
|
12
|
+
from datasketch.hnsw import HNSW
|
|
13
|
+
from datasketch.hyperloglog import HyperLogLog, HyperLogLogPlusPlus
|
|
14
|
+
from datasketch.lean_minhash import LeanMinHash
|
|
4
15
|
from datasketch.lsh import MinHashLSH
|
|
5
16
|
from datasketch.lsh_bloom import MinHashLSHBloom
|
|
6
|
-
from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerator
|
|
7
|
-
from datasketch.lshforest import MinHashLSHForest
|
|
8
17
|
from datasketch.lshensemble import MinHashLSHEnsemble
|
|
9
|
-
from datasketch.
|
|
10
|
-
from datasketch.
|
|
11
|
-
from datasketch.
|
|
18
|
+
from datasketch.lshforest import MinHashLSHForest
|
|
19
|
+
from datasketch.minhash import MinHash
|
|
20
|
+
from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerator
|
|
12
21
|
|
|
13
22
|
# Alias
|
|
14
23
|
WeightedMinHashLSH = MinHashLSH
|
|
15
24
|
WeightedMinHashLSHForest = MinHashLSHForest
|
|
16
25
|
|
|
17
|
-
|
|
18
|
-
|
|
26
|
+
|
|
27
|
+
__all__ = [
|
|
28
|
+
"HNSW",
|
|
29
|
+
"HyperLogLog",
|
|
30
|
+
"HyperLogLogPlusPlus",
|
|
31
|
+
"LeanMinHash",
|
|
32
|
+
"MinHash",
|
|
33
|
+
"MinHashLSH",
|
|
34
|
+
"MinHashLSHBloom",
|
|
35
|
+
"MinHashLSHEnsemble",
|
|
36
|
+
"MinHashLSHForest",
|
|
37
|
+
"WeightedMinHash",
|
|
38
|
+
"WeightedMinHashGenerator",
|
|
39
|
+
"WeightedMinHashLSH",
|
|
40
|
+
"WeightedMinHashLSHForest",
|
|
41
|
+
"bBitMinHash",
|
|
42
|
+
"sha1_hash32",
|
|
43
|
+
]
|