datasketch 1.7.0__tar.gz → 1.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datasketch-1.9.0/.gitignore +87 -0
- datasketch-1.9.0/PKG-INFO +262 -0
- datasketch-1.7.0/PKG-INFO → datasketch-1.9.0/README.rst +121 -70
- {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/__init__.py +34 -9
- {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/b_bit_minhash.py +57 -72
- {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/experimental/__init__.py +4 -7
- {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/experimental/aio/lsh.py +110 -118
- {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/experimental/aio/storage.py +138 -122
- {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/hashfunc.py +6 -3
- {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/hnsw.py +103 -151
- {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/hyperloglog.py +55 -66
- {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/lean_minhash.py +25 -25
- {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/lsh.py +199 -134
- datasketch-1.9.0/datasketch/lsh_bloom.py +377 -0
- {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/lshensemble.py +39 -46
- {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/lshensemble_partition.py +34 -33
- {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/lshforest.py +28 -36
- {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/minhash.py +176 -49
- {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/storage.py +188 -233
- {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/weighted_minhash.py +28 -30
- datasketch-1.9.0/pyproject.toml +197 -0
- datasketch-1.7.0/README.rst +0 -88
- datasketch-1.7.0/datasketch/lsh_bloom.py +0 -335
- datasketch-1.7.0/datasketch/version.py +0 -1
- datasketch-1.7.0/datasketch.egg-info/PKG-INFO +0 -153
- datasketch-1.7.0/datasketch.egg-info/SOURCES.txt +0 -38
- datasketch-1.7.0/datasketch.egg-info/dependency_links.txt +0 -1
- datasketch-1.7.0/datasketch.egg-info/requires.txt +0 -43
- datasketch-1.7.0/datasketch.egg-info/top_level.txt +0 -1
- datasketch-1.7.0/setup.cfg +0 -4
- datasketch-1.7.0/setup.py +0 -91
- datasketch-1.7.0/test/test_hnsw.py +0 -306
- datasketch-1.7.0/test/test_hyperloglog.py +0 -177
- datasketch-1.7.0/test/test_lean_minhash.py +0 -190
- datasketch-1.7.0/test/test_lsh.py +0 -451
- datasketch-1.7.0/test/test_lsh_cassandra.py +0 -269
- datasketch-1.7.0/test/test_lshbloom.py +0 -126
- datasketch-1.7.0/test/test_lshensemble.py +0 -92
- datasketch-1.7.0/test/test_lshforest.py +0 -149
- datasketch-1.7.0/test/test_minhash.py +0 -203
- datasketch-1.7.0/test/test_weighted_minhash.py +0 -108
- {datasketch-1.7.0 → datasketch-1.9.0}/LICENSE +0 -0
- {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/experimental/aio/__init__.py +0 -0
- {datasketch-1.7.0 → datasketch-1.9.0}/datasketch/hyperloglog_const.py +0 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
|
|
5
|
+
# C extensions
|
|
6
|
+
*.so
|
|
7
|
+
|
|
8
|
+
# VIM stuff
|
|
9
|
+
*.swp
|
|
10
|
+
|
|
11
|
+
# Distribution / packaging
|
|
12
|
+
.Python
|
|
13
|
+
env/
|
|
14
|
+
build/
|
|
15
|
+
develop-eggs/
|
|
16
|
+
dist/
|
|
17
|
+
downloads/
|
|
18
|
+
eggs/
|
|
19
|
+
.eggs/
|
|
20
|
+
lib/
|
|
21
|
+
lib64/
|
|
22
|
+
parts/
|
|
23
|
+
sdist/
|
|
24
|
+
var/
|
|
25
|
+
*.egg-info/
|
|
26
|
+
.installed.cfg
|
|
27
|
+
*.egg
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.coverage
|
|
43
|
+
.coverage.*
|
|
44
|
+
.cache
|
|
45
|
+
nosetests.xml
|
|
46
|
+
coverage.xml
|
|
47
|
+
*,cover
|
|
48
|
+
|
|
49
|
+
# Translations
|
|
50
|
+
*.mo
|
|
51
|
+
*.pot
|
|
52
|
+
|
|
53
|
+
# Django stuff:
|
|
54
|
+
*.log
|
|
55
|
+
|
|
56
|
+
# Sphinx documentation
|
|
57
|
+
docs/_build/
|
|
58
|
+
|
|
59
|
+
# PyBuilder
|
|
60
|
+
target/
|
|
61
|
+
|
|
62
|
+
# Documentation
|
|
63
|
+
.doctrees
|
|
64
|
+
_build
|
|
65
|
+
doctrees
|
|
66
|
+
|
|
67
|
+
# Jetbrains
|
|
68
|
+
.idea
|
|
69
|
+
|
|
70
|
+
# Benchmark files
|
|
71
|
+
benchmark/**/*.inp.gz
|
|
72
|
+
|
|
73
|
+
# Benchmark output
|
|
74
|
+
benchmark/**/*.sqlite
|
|
75
|
+
|
|
76
|
+
# Benchmark plots
|
|
77
|
+
benchmark/**/*.png
|
|
78
|
+
benchmark/**/*.pdf
|
|
79
|
+
|
|
80
|
+
# Virtual env
|
|
81
|
+
.venv
|
|
82
|
+
|
|
83
|
+
# IDE
|
|
84
|
+
.vscode
|
|
85
|
+
|
|
86
|
+
# MacOS
|
|
87
|
+
.DS_Store
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datasketch
|
|
3
|
+
Version: 1.9.0
|
|
4
|
+
Summary: Probabilistic data structures for processing and searching very large datasets
|
|
5
|
+
Project-URL: Homepage, https://ekzhu.github.io/datasketch
|
|
6
|
+
Project-URL: Bug Tracker, https://github.com/ekzhu/datasketch/issues
|
|
7
|
+
Project-URL: Documentation, https://ekzhu.github.io/datasketch
|
|
8
|
+
Project-URL: Source, https://github.com/ekzhu/datasketch
|
|
9
|
+
Author-email: ekzhu <ekzhu@cs.toronto.edu>
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: database,datamining
|
|
13
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Database
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
23
|
+
Requires-Python: >=3.9
|
|
24
|
+
Requires-Dist: numpy>=1.11
|
|
25
|
+
Requires-Dist: scipy>=1.0.0
|
|
26
|
+
Provides-Extra: benchmark
|
|
27
|
+
Requires-Dist: matplotlib>=3.1.2; extra == 'benchmark'
|
|
28
|
+
Requires-Dist: nltk>=3.4.5; extra == 'benchmark'
|
|
29
|
+
Requires-Dist: pandas>=0.25.3; extra == 'benchmark'
|
|
30
|
+
Requires-Dist: pyfarmhash>=0.2.2; extra == 'benchmark'
|
|
31
|
+
Requires-Dist: pyhash>=0.9.3; extra == 'benchmark'
|
|
32
|
+
Requires-Dist: scikit-learn>=0.21.3; extra == 'benchmark'
|
|
33
|
+
Requires-Dist: scipy>=1.3.3; extra == 'benchmark'
|
|
34
|
+
Requires-Dist: setsimilaritysearch>=0.1.7; extra == 'benchmark'
|
|
35
|
+
Provides-Extra: bloom
|
|
36
|
+
Requires-Dist: pybloomfilter3>=0.7.2; extra == 'bloom'
|
|
37
|
+
Provides-Extra: cassandra
|
|
38
|
+
Requires-Dist: cassandra-driver>=3.20; extra == 'cassandra'
|
|
39
|
+
Provides-Extra: experimental-aio
|
|
40
|
+
Requires-Dist: aiounittest; extra == 'experimental-aio'
|
|
41
|
+
Requires-Dist: motor>3.6.0; extra == 'experimental-aio'
|
|
42
|
+
Provides-Extra: redis
|
|
43
|
+
Requires-Dist: redis>=2.10.0; extra == 'redis'
|
|
44
|
+
Provides-Extra: test
|
|
45
|
+
Requires-Dist: cassandra-driver>=3.20; extra == 'test'
|
|
46
|
+
Requires-Dist: coverage; extra == 'test'
|
|
47
|
+
Requires-Dist: mock>=2.0.0; extra == 'test'
|
|
48
|
+
Requires-Dist: mockredispy; extra == 'test'
|
|
49
|
+
Requires-Dist: nose-exclude>=0.5.0; extra == 'test'
|
|
50
|
+
Requires-Dist: nose>=1.3.7; extra == 'test'
|
|
51
|
+
Requires-Dist: pymongo>=3.9.0; extra == 'test'
|
|
52
|
+
Requires-Dist: pytest; extra == 'test'
|
|
53
|
+
Requires-Dist: pytest-asyncio; extra == 'test'
|
|
54
|
+
Requires-Dist: pytest-cov; extra == 'test'
|
|
55
|
+
Requires-Dist: pytest-rerunfailures; extra == 'test'
|
|
56
|
+
Requires-Dist: redis>=2.10.0; extra == 'test'
|
|
57
|
+
Description-Content-Type: text/x-rst
|
|
58
|
+
|
|
59
|
+
datasketch: Big Data Looks Small
|
|
60
|
+
================================
|
|
61
|
+
|
|
62
|
+
.. image:: https://static.pepy.tech/badge/datasketch/month
|
|
63
|
+
:target: https://pepy.tech/project/datasketch
|
|
64
|
+
|
|
65
|
+
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
|
|
66
|
+
:target: https://zenodo.org/doi/10.5281/zenodo.598238
|
|
67
|
+
|
|
68
|
+
.. image:: https://codecov.io/gh/ekzhu/datasketch/branch/master/graph/badge.svg
|
|
69
|
+
:target: https://codecov.io/gh/ekzhu/datasketch
|
|
70
|
+
|
|
71
|
+
datasketch gives you probabilistic data structures that can process and
|
|
72
|
+
search very large amounts of data super fast, with little loss of
|
|
73
|
+
accuracy.
|
|
74
|
+
|
|
75
|
+
This package contains the following data sketches:
|
|
76
|
+
|
|
77
|
+
+-------------------------+-----------------------------------------------+
|
|
78
|
+
| Data Sketch | Usage |
|
|
79
|
+
+=========================+===============================================+
|
|
80
|
+
| `MinHash`_ | estimate Jaccard similarity and cardinality |
|
|
81
|
+
+-------------------------+-----------------------------------------------+
|
|
82
|
+
| `Weighted MinHash`_ | estimate weighted Jaccard similarity |
|
|
83
|
+
+-------------------------+-----------------------------------------------+
|
|
84
|
+
| `HyperLogLog`_ | estimate cardinality |
|
|
85
|
+
+-------------------------+-----------------------------------------------+
|
|
86
|
+
| `HyperLogLog++`_ | estimate cardinality |
|
|
87
|
+
+-------------------------+-----------------------------------------------+
|
|
88
|
+
|
|
89
|
+
The following indexes for data sketches are provided to support
|
|
90
|
+
sub-linear query time:
|
|
91
|
+
|
|
92
|
+
+---------------------------+-----------------------------+------------------------+
|
|
93
|
+
| Index | For Data Sketch | Supported Query Type |
|
|
94
|
+
+===========================+=============================+========================+
|
|
95
|
+
| `MinHash LSH`_ | MinHash, Weighted MinHash | Jaccard Threshold |
|
|
96
|
+
+---------------------------+-----------------------------+------------------------+
|
|
97
|
+
| `LSHBloom`_ | MinHash, Weighted MinHash | Jaccard Threshold |
|
|
98
|
+
+---------------------------+-----------------------------+------------------------+
|
|
99
|
+
| `MinHash LSH Forest`_ | MinHash, Weighted MinHash | Jaccard Top-K |
|
|
100
|
+
+---------------------------+-----------------------------+------------------------+
|
|
101
|
+
| `MinHash LSH Ensemble`_ | MinHash | Containment Threshold |
|
|
102
|
+
+---------------------------+-----------------------------+------------------------+
|
|
103
|
+
| `HNSW`_ | Any | Custom Metric Top-K |
|
|
104
|
+
+---------------------------+-----------------------------+------------------------+
|
|
105
|
+
|
|
106
|
+
datasketch must be used with Python 3.9 or above, NumPy 1.11 or above, and SciPy 1.0.0 or above.
|
|
107
|
+
|
|
108
|
+
Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
|
|
109
|
+
storage layers (see `MinHash LSH at Scale`_).
|
|
110
|
+
|
|
111
|
+
Install
|
|
112
|
+
-------
|
|
113
|
+
|
|
114
|
+
To install datasketch using ``pip``:
|
|
115
|
+
|
|
116
|
+
.. code-block:: bash
|
|
117
|
+
|
|
118
|
+
pip install datasketch
|
|
119
|
+
|
|
120
|
+
This will also install NumPy and SciPy as dependencies.
|
|
121
|
+
|
|
122
|
+
To install with Redis dependency:
|
|
123
|
+
|
|
124
|
+
.. code-block:: bash
|
|
125
|
+
|
|
126
|
+
pip install datasketch[redis]
|
|
127
|
+
|
|
128
|
+
To install with Cassandra dependency:
|
|
129
|
+
|
|
130
|
+
.. code-block:: bash
|
|
131
|
+
|
|
132
|
+
pip install datasketch[cassandra]
|
|
133
|
+
|
|
134
|
+
To install with Bloom filter dependency:
|
|
135
|
+
|
|
136
|
+
.. code-block:: bash
|
|
137
|
+
|
|
138
|
+
pip install datasketch[bloom]
|
|
139
|
+
|
|
140
|
+
.. _`MinHash`: https://ekzhu.github.io/datasketch/minhash.html
|
|
141
|
+
.. _`Weighted MinHash`: https://ekzhu.github.io/datasketch/weightedminhash.html
|
|
142
|
+
.. _`HyperLogLog`: https://ekzhu.github.io/datasketch/hyperloglog.html
|
|
143
|
+
.. _`HyperLogLog++`: https://ekzhu.github.io/datasketch/hyperloglog.html#hyperloglog-plusplus
|
|
144
|
+
.. _`MinHash LSH`: https://ekzhu.github.io/datasketch/lsh.html
|
|
145
|
+
.. _`MinHash LSH Forest`: https://ekzhu.github.io/datasketch/lshforest.html
|
|
146
|
+
.. _`MinHash LSH Ensemble`: https://ekzhu.github.io/datasketch/lshensemble.html
|
|
147
|
+
.. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
|
|
148
|
+
.. _`MinHash LSH at Scale`: https://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
|
|
149
|
+
.. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
|
|
150
|
+
|
|
151
|
+
Contributing
|
|
152
|
+
------------
|
|
153
|
+
|
|
154
|
+
We welcome contributions from everyone. Whether you're fixing bugs, adding features, improving documentation, or helping with tests, your contributions are valuable.
|
|
155
|
+
|
|
156
|
+
Development Setup
|
|
157
|
+
^^^^^^^^^^^^^^^^^
|
|
158
|
+
|
|
159
|
+
The project uses ``uv`` for fast and reliable Python package management. Follow these steps to set up your development environment:
|
|
160
|
+
|
|
161
|
+
1. **Install uv**: Follow the official installation guide at https://docs.astral.sh/uv/getting-started/installation/
|
|
162
|
+
|
|
163
|
+
2. **Clone the repository**:
|
|
164
|
+
|
|
165
|
+
.. code-block:: bash
|
|
166
|
+
|
|
167
|
+
git clone https://github.com/ekzhu/datasketch.git
|
|
168
|
+
cd datasketch
|
|
169
|
+
|
|
170
|
+
3. **Set up the environment**:
|
|
171
|
+
|
|
172
|
+
.. code-block:: bash
|
|
173
|
+
|
|
174
|
+
# Create a virtual environment
|
|
175
|
+
# (Optional: specify Python version with --python 3.x)
|
|
176
|
+
uv venv
|
|
177
|
+
# Activate the virtual environment (optional, uv run commands work without it)
|
|
178
|
+
source .venv/bin/activate
|
|
179
|
+
|
|
180
|
+
# Install all dependencies
|
|
181
|
+
uv sync
|
|
182
|
+
|
|
183
|
+
4. **Verify installation**:
|
|
184
|
+
|
|
185
|
+
.. code-block:: bash
|
|
186
|
+
|
|
187
|
+
# Run tests to ensure everything works
|
|
188
|
+
uv run pytest
|
|
189
|
+
|
|
190
|
+
5. **Optional dependencies** (for specific development needs):
|
|
191
|
+
|
|
192
|
+
.. code-block:: bash
|
|
193
|
+
|
|
194
|
+
# For testing
|
|
195
|
+
uv sync --extra test
|
|
196
|
+
|
|
197
|
+
# For Cassandra support
|
|
198
|
+
uv sync --extra cassandra
|
|
199
|
+
|
|
200
|
+
# For Redis support
|
|
201
|
+
uv sync --extra redis
|
|
202
|
+
|
|
203
|
+
# For all extras
|
|
204
|
+
uv sync --all-extras
|
|
205
|
+
|
|
206
|
+
Learn more about ``uv`` at https://docs.astral.sh/uv/
|
|
207
|
+
|
|
208
|
+
Development Workflow
|
|
209
|
+
^^^^^^^^^^^^^^^^^^^^
|
|
210
|
+
|
|
211
|
+
1. **Fork the repository** on GitHub if you haven't already.
|
|
212
|
+
|
|
213
|
+
2. **Create a feature branch** for your changes:
|
|
214
|
+
|
|
215
|
+
.. code-block:: bash
|
|
216
|
+
|
|
217
|
+
git checkout -b feature/your-feature-name
|
|
218
|
+
# Or for bug fixes:
|
|
219
|
+
git checkout -b fix/issue-description
|
|
220
|
+
|
|
221
|
+
3. **Make your changes** following the project's coding standards.
|
|
222
|
+
|
|
223
|
+
4. **Run the tests** to ensure nothing is broken:
|
|
224
|
+
|
|
225
|
+
.. code-block:: bash
|
|
226
|
+
|
|
227
|
+
uv run pytest
|
|
228
|
+
|
|
229
|
+
5. **Check code quality** with ruff:
|
|
230
|
+
|
|
231
|
+
.. code-block:: bash
|
|
232
|
+
|
|
233
|
+
# Check for issues
|
|
234
|
+
uvx ruff check .
|
|
235
|
+
|
|
236
|
+
# Auto-fix formatting issues
|
|
237
|
+
uvx ruff format .
|
|
238
|
+
|
|
239
|
+
6. **Commit your changes** with a clear, descriptive commit message:
|
|
240
|
+
|
|
241
|
+
.. code-block:: bash
|
|
242
|
+
|
|
243
|
+
git commit -m "Add feature: brief description of what was changed"
|
|
244
|
+
|
|
245
|
+
7. **Push to your fork** and create a pull request on GitHub:
|
|
246
|
+
|
|
247
|
+
.. code-block:: bash
|
|
248
|
+
|
|
249
|
+
git push origin your-branch-name
|
|
250
|
+
|
|
251
|
+
8. **Respond to feedback** from maintainers and iterate on your changes.
|
|
252
|
+
|
|
253
|
+
Guidelines
|
|
254
|
+
^^^^^^^^^^
|
|
255
|
+
|
|
256
|
+
- Follow PEP 8 style guidelines
|
|
257
|
+
- Write tests for new features
|
|
258
|
+
- Update documentation as needed
|
|
259
|
+
- Keep commits focused and atomic
|
|
260
|
+
- Be respectful in discussions
|
|
261
|
+
|
|
262
|
+
For more information, check the `GitHub issues <https://github.com/ekzhu/datasketch/issues>`_ for current priorities or areas needing help. You can also join the discussion on `project roadmap and priorities <https://github.com/ekzhu/datasketch/discussions/252>`_.
|
|
@@ -1,68 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: datasketch
|
|
3
|
-
Version: 1.7.0
|
|
4
|
-
Summary: Probabilistic data structures for processing and searching very large datasets
|
|
5
|
-
Home-page: https://ekzhu.github.io/datasketch
|
|
6
|
-
Author: ekzhu
|
|
7
|
-
Author-email: ekzhu@cs.toronto.edu
|
|
8
|
-
License: MIT
|
|
9
|
-
Project-URL: Source, https://github.com/ekzhu/datasketch
|
|
10
|
-
Keywords: database datamining
|
|
11
|
-
Classifier: Development Status :: 5 - Production/Stable
|
|
12
|
-
Classifier: Intended Audience :: Developers
|
|
13
|
-
Classifier: Topic :: Database
|
|
14
|
-
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
15
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
-
License-File: LICENSE
|
|
22
|
-
Requires-Dist: numpy>=1.11
|
|
23
|
-
Requires-Dist: scipy>=1.0.0
|
|
24
|
-
Provides-Extra: cassandra
|
|
25
|
-
Requires-Dist: cassandra-driver>=3.20; extra == "cassandra"
|
|
26
|
-
Provides-Extra: redis
|
|
27
|
-
Requires-Dist: redis>=2.10.0; extra == "redis"
|
|
28
|
-
Provides-Extra: bloom
|
|
29
|
-
Requires-Dist: pybloomfilter3>=0.7.2; python_version >= "3.9" and extra == "bloom"
|
|
30
|
-
Requires-Dist: pybloomfiltermmap3==0.6.0; python_version < "3.9" and extra == "bloom"
|
|
31
|
-
Provides-Extra: benchmark
|
|
32
|
-
Requires-Dist: pyhash>=0.9.3; extra == "benchmark"
|
|
33
|
-
Requires-Dist: matplotlib>=3.1.2; extra == "benchmark"
|
|
34
|
-
Requires-Dist: scikit-learn>=0.21.3; extra == "benchmark"
|
|
35
|
-
Requires-Dist: scipy>=1.3.3; extra == "benchmark"
|
|
36
|
-
Requires-Dist: pandas>=0.25.3; extra == "benchmark"
|
|
37
|
-
Requires-Dist: SetSimilaritySearch>=0.1.7; extra == "benchmark"
|
|
38
|
-
Requires-Dist: pyfarmhash>=0.2.2; extra == "benchmark"
|
|
39
|
-
Requires-Dist: nltk>=3.4.5; extra == "benchmark"
|
|
40
|
-
Provides-Extra: test
|
|
41
|
-
Requires-Dist: cassandra-driver>=3.20; extra == "test"
|
|
42
|
-
Requires-Dist: redis>=2.10.0; extra == "test"
|
|
43
|
-
Requires-Dist: mock>=2.0.0; extra == "test"
|
|
44
|
-
Requires-Dist: mockredispy; extra == "test"
|
|
45
|
-
Requires-Dist: coverage; extra == "test"
|
|
46
|
-
Requires-Dist: pymongo>=3.9.0; extra == "test"
|
|
47
|
-
Requires-Dist: nose>=1.3.7; extra == "test"
|
|
48
|
-
Requires-Dist: nose-exclude>=0.5.0; extra == "test"
|
|
49
|
-
Requires-Dist: pytest; extra == "test"
|
|
50
|
-
Provides-Extra: experimental-aio
|
|
51
|
-
Requires-Dist: aiounittest; python_version >= "3.8" and extra == "experimental-aio"
|
|
52
|
-
Requires-Dist: motor>3.6.0; python_version >= "3.8" and extra == "experimental-aio"
|
|
53
|
-
Dynamic: author
|
|
54
|
-
Dynamic: author-email
|
|
55
|
-
Dynamic: classifier
|
|
56
|
-
Dynamic: description
|
|
57
|
-
Dynamic: home-page
|
|
58
|
-
Dynamic: keywords
|
|
59
|
-
Dynamic: license
|
|
60
|
-
Dynamic: license-file
|
|
61
|
-
Dynamic: project-url
|
|
62
|
-
Dynamic: provides-extra
|
|
63
|
-
Dynamic: requires-dist
|
|
64
|
-
Dynamic: summary
|
|
65
|
-
|
|
66
1
|
datasketch: Big Data Looks Small
|
|
67
2
|
================================
|
|
68
3
|
|
|
@@ -72,6 +7,9 @@ datasketch: Big Data Looks Small
|
|
|
72
7
|
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
|
|
73
8
|
:target: https://zenodo.org/doi/10.5281/zenodo.598238
|
|
74
9
|
|
|
10
|
+
.. image:: https://codecov.io/gh/ekzhu/datasketch/branch/master/graph/badge.svg
|
|
11
|
+
:target: https://codecov.io/gh/ekzhu/datasketch
|
|
12
|
+
|
|
75
13
|
datasketch gives you probabilistic data structures that can process and
|
|
76
14
|
search very large amounts of data super fast, with little loss of
|
|
77
15
|
accuracy.
|
|
@@ -107,7 +45,7 @@ sub-linear query time:
|
|
|
107
45
|
| `HNSW`_ | Any | Custom Metric Top-K |
|
|
108
46
|
+---------------------------+-----------------------------+------------------------+
|
|
109
47
|
|
|
110
|
-
datasketch must be used with Python 3.
|
|
48
|
+
datasketch must be used with Python 3.9 or above, NumPy 1.11 or above, and SciPy 1.0.0 or above.
|
|
111
49
|
|
|
112
50
|
Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
|
|
113
51
|
storage layers (see `MinHash LSH at Scale`_).
|
|
@@ -117,7 +55,7 @@ Install
|
|
|
117
55
|
|
|
118
56
|
To install datasketch using ``pip``:
|
|
119
57
|
|
|
120
|
-
::
|
|
58
|
+
.. code-block:: bash
|
|
121
59
|
|
|
122
60
|
pip install datasketch
|
|
123
61
|
|
|
@@ -125,19 +63,19 @@ This will also install NumPy as dependency.
|
|
|
125
63
|
|
|
126
64
|
To install with Redis dependency:
|
|
127
65
|
|
|
128
|
-
::
|
|
66
|
+
.. code-block:: bash
|
|
129
67
|
|
|
130
68
|
pip install datasketch[redis]
|
|
131
69
|
|
|
132
70
|
To install with Cassandra dependency:
|
|
133
71
|
|
|
134
|
-
::
|
|
72
|
+
.. code-block:: bash
|
|
135
73
|
|
|
136
74
|
pip install datasketch[cassandra]
|
|
137
75
|
|
|
138
76
|
To install with Bloom filter dependency:
|
|
139
77
|
|
|
140
|
-
::
|
|
78
|
+
.. code-block:: bash
|
|
141
79
|
|
|
142
80
|
pip install datasketch[bloom]
|
|
143
81
|
|
|
@@ -151,3 +89,116 @@ To install with Bloom filter dependency:
|
|
|
151
89
|
.. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
|
|
152
90
|
.. _`MinHash LSH at Scale`: https://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
|
|
153
91
|
.. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
|
|
92
|
+
|
|
93
|
+
Contributing
|
|
94
|
+
------------
|
|
95
|
+
|
|
96
|
+
We welcome contributions from everyone. Whether you're fixing bugs, adding features, improving documentation, or helping with tests, your contributions are valuable.
|
|
97
|
+
|
|
98
|
+
Development Setup
|
|
99
|
+
^^^^^^^^^^^^^^^^^
|
|
100
|
+
|
|
101
|
+
The project uses ``uv`` for fast and reliable Python package management. Follow these steps to set up your development environment:
|
|
102
|
+
|
|
103
|
+
1. **Install uv**: Follow the official installation guide at https://docs.astral.sh/uv/getting-started/installation/
|
|
104
|
+
|
|
105
|
+
2. **Clone the repository**:
|
|
106
|
+
|
|
107
|
+
.. code-block:: bash
|
|
108
|
+
|
|
109
|
+
git clone https://github.com/ekzhu/datasketch.git
|
|
110
|
+
cd datasketch
|
|
111
|
+
|
|
112
|
+
3. **Set up the environment**:
|
|
113
|
+
|
|
114
|
+
.. code-block:: bash
|
|
115
|
+
|
|
116
|
+
# Create a virtual environment
|
|
117
|
+
# (Optional: specify Python version with --python 3.x)
|
|
118
|
+
uv venv
|
|
119
|
+
# Activate the virtual environment (optional, uv run commands work without it)
|
|
120
|
+
source .venv/bin/activate
|
|
121
|
+
|
|
122
|
+
# Install all dependencies
|
|
123
|
+
uv sync
|
|
124
|
+
|
|
125
|
+
4. **Verify installation**:
|
|
126
|
+
|
|
127
|
+
.. code-block:: bash
|
|
128
|
+
|
|
129
|
+
# Run tests to ensure everything works
|
|
130
|
+
uv run pytest
|
|
131
|
+
|
|
132
|
+
5. **Optional dependencies** (for specific development needs):
|
|
133
|
+
|
|
134
|
+
.. code-block:: bash
|
|
135
|
+
|
|
136
|
+
# For testing
|
|
137
|
+
uv sync --extra test
|
|
138
|
+
|
|
139
|
+
# For Cassandra support
|
|
140
|
+
uv sync --extra cassandra
|
|
141
|
+
|
|
142
|
+
# For Redis support
|
|
143
|
+
uv sync --extra redis
|
|
144
|
+
|
|
145
|
+
# For all extras
|
|
146
|
+
uv sync --all-extras
|
|
147
|
+
|
|
148
|
+
Learn more about ``uv`` at https://docs.astral.sh/uv/
|
|
149
|
+
|
|
150
|
+
Development Workflow
|
|
151
|
+
^^^^^^^^^^^^^^^^^^^^
|
|
152
|
+
|
|
153
|
+
1. **Fork the repository** on GitHub if you haven't already.
|
|
154
|
+
|
|
155
|
+
2. **Create a feature branch** for your changes:
|
|
156
|
+
|
|
157
|
+
.. code-block:: bash
|
|
158
|
+
|
|
159
|
+
git checkout -b feature/your-feature-name
|
|
160
|
+
# Or for bug fixes:
|
|
161
|
+
git checkout -b fix/issue-description
|
|
162
|
+
|
|
163
|
+
3. **Make your changes** following the project's coding standards.
|
|
164
|
+
|
|
165
|
+
4. **Run the tests** to ensure nothing is broken:
|
|
166
|
+
|
|
167
|
+
.. code-block:: bash
|
|
168
|
+
|
|
169
|
+
uv run pytest
|
|
170
|
+
|
|
171
|
+
5. **Check code quality** with ruff:
|
|
172
|
+
|
|
173
|
+
.. code-block:: bash
|
|
174
|
+
|
|
175
|
+
# Check for issues
|
|
176
|
+
uvx ruff check .
|
|
177
|
+
|
|
178
|
+
# Auto-fix formatting issues
|
|
179
|
+
uvx ruff format .
|
|
180
|
+
|
|
181
|
+
6. **Commit your changes** with a clear, descriptive commit message:
|
|
182
|
+
|
|
183
|
+
.. code-block:: bash
|
|
184
|
+
|
|
185
|
+
git commit -m "Add feature: brief description of what was changed"
|
|
186
|
+
|
|
187
|
+
7. **Push to your fork** and create a pull request on GitHub:
|
|
188
|
+
|
|
189
|
+
.. code-block:: bash
|
|
190
|
+
|
|
191
|
+
git push origin your-branch-name
|
|
192
|
+
|
|
193
|
+
8. **Respond to feedback** from maintainers and iterate on your changes.
|
|
194
|
+
|
|
195
|
+
Guidelines
|
|
196
|
+
^^^^^^^^^^
|
|
197
|
+
|
|
198
|
+
- Follow PEP 8 style guidelines
|
|
199
|
+
- Write tests for new features
|
|
200
|
+
- Update documentation as needed
|
|
201
|
+
- Keep commits focused and atomic
|
|
202
|
+
- Be respectful in discussions
|
|
203
|
+
|
|
204
|
+
For more information, check the `GitHub issues <https://github.com/ekzhu/datasketch/issues>`_ for current priorities or areas needing help. You can also join the discussion on `project roadmap and priorities <https://github.com/ekzhu/datasketch/discussions/252>`_.
|
|
@@ -1,18 +1,43 @@
|
|
|
1
|
-
|
|
2
|
-
from
|
|
1
|
+
import importlib.metadata
|
|
2
|
+
from typing import Final
|
|
3
|
+
|
|
4
|
+
try:
|
|
5
|
+
_version = importlib.metadata.version(__name__)
|
|
6
|
+
except importlib.metadata.PackageNotFoundError:
|
|
7
|
+
_version = "0.0.0" # Fallback for development mode
|
|
8
|
+
__version__: Final[str] = _version
|
|
9
|
+
|
|
3
10
|
from datasketch.b_bit_minhash import bBitMinHash
|
|
11
|
+
from datasketch.hashfunc import sha1_hash32
|
|
12
|
+
from datasketch.hnsw import HNSW
|
|
13
|
+
from datasketch.hyperloglog import HyperLogLog, HyperLogLogPlusPlus
|
|
14
|
+
from datasketch.lean_minhash import LeanMinHash
|
|
4
15
|
from datasketch.lsh import MinHashLSH
|
|
5
16
|
from datasketch.lsh_bloom import MinHashLSHBloom
|
|
6
|
-
from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerator
|
|
7
|
-
from datasketch.lshforest import MinHashLSHForest
|
|
8
17
|
from datasketch.lshensemble import MinHashLSHEnsemble
|
|
9
|
-
from datasketch.
|
|
10
|
-
from datasketch.
|
|
11
|
-
from datasketch.
|
|
18
|
+
from datasketch.lshforest import MinHashLSHForest
|
|
19
|
+
from datasketch.minhash import MinHash
|
|
20
|
+
from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerator
|
|
12
21
|
|
|
13
22
|
# Alias
|
|
14
23
|
WeightedMinHashLSH = MinHashLSH
|
|
15
24
|
WeightedMinHashLSHForest = MinHashLSHForest
|
|
16
25
|
|
|
17
|
-
|
|
18
|
-
|
|
26
|
+
|
|
27
|
+
__all__ = [
|
|
28
|
+
"HNSW",
|
|
29
|
+
"HyperLogLog",
|
|
30
|
+
"HyperLogLogPlusPlus",
|
|
31
|
+
"LeanMinHash",
|
|
32
|
+
"MinHash",
|
|
33
|
+
"MinHashLSH",
|
|
34
|
+
"MinHashLSHBloom",
|
|
35
|
+
"MinHashLSHEnsemble",
|
|
36
|
+
"MinHashLSHForest",
|
|
37
|
+
"WeightedMinHash",
|
|
38
|
+
"WeightedMinHashGenerator",
|
|
39
|
+
"WeightedMinHashLSH",
|
|
40
|
+
"WeightedMinHashLSHForest",
|
|
41
|
+
"bBitMinHash",
|
|
42
|
+
"sha1_hash32",
|
|
43
|
+
]
|