datasketch 1.6.5__tar.gz → 1.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datasketch-1.8.0/.gitignore +87 -0
- datasketch-1.8.0/PKG-INFO +256 -0
- datasketch-1.6.5/PKG-INFO → datasketch-1.8.0/README.rst +125 -54
- datasketch-1.8.0/datasketch/__init__.py +43 -0
- {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/b_bit_minhash.py +59 -67
- {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/experimental/__init__.py +4 -7
- {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/experimental/aio/lsh.py +101 -117
- {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/experimental/aio/storage.py +105 -107
- {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/hashfunc.py +6 -3
- {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/hnsw.py +103 -151
- {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/hyperloglog.py +55 -66
- {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/lean_minhash.py +25 -25
- {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/lsh.py +189 -132
- datasketch-1.8.0/datasketch/lsh_bloom.py +377 -0
- {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/lshensemble.py +37 -45
- {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/lshensemble_partition.py +34 -33
- {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/lshforest.py +28 -36
- {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/minhash.py +171 -47
- {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/storage.py +175 -228
- {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/weighted_minhash.py +23 -25
- datasketch-1.8.0/pyproject.toml +162 -0
- datasketch-1.6.5/README.rst +0 -80
- datasketch-1.6.5/datasketch/__init__.py +0 -17
- datasketch-1.6.5/datasketch/version.py +0 -1
- datasketch-1.6.5/datasketch.egg-info/PKG-INFO +0 -130
- datasketch-1.6.5/datasketch.egg-info/SOURCES.txt +0 -36
- datasketch-1.6.5/datasketch.egg-info/dependency_links.txt +0 -1
- datasketch-1.6.5/datasketch.egg-info/requires.txt +0 -35
- datasketch-1.6.5/datasketch.egg-info/top_level.txt +0 -1
- datasketch-1.6.5/setup.cfg +0 -4
- datasketch-1.6.5/setup.py +0 -87
- datasketch-1.6.5/test/test_hnsw.py +0 -306
- datasketch-1.6.5/test/test_hyperloglog.py +0 -177
- datasketch-1.6.5/test/test_lean_minhash.py +0 -190
- datasketch-1.6.5/test/test_lsh.py +0 -451
- datasketch-1.6.5/test/test_lsh_cassandra.py +0 -269
- datasketch-1.6.5/test/test_lshensemble.py +0 -92
- datasketch-1.6.5/test/test_lshforest.py +0 -149
- datasketch-1.6.5/test/test_minhash.py +0 -203
- datasketch-1.6.5/test/test_weighted_minhash.py +0 -108
- {datasketch-1.6.5 → datasketch-1.8.0}/LICENSE +0 -0
- {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/experimental/aio/__init__.py +0 -0
- {datasketch-1.6.5 → datasketch-1.8.0}/datasketch/hyperloglog_const.py +0 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
|
|
5
|
+
# C extensions
|
|
6
|
+
*.so
|
|
7
|
+
|
|
8
|
+
# VIM stuff
|
|
9
|
+
*.swp
|
|
10
|
+
|
|
11
|
+
# Distribution / packaging
|
|
12
|
+
.Python
|
|
13
|
+
env/
|
|
14
|
+
build/
|
|
15
|
+
develop-eggs/
|
|
16
|
+
dist/
|
|
17
|
+
downloads/
|
|
18
|
+
eggs/
|
|
19
|
+
.eggs/
|
|
20
|
+
lib/
|
|
21
|
+
lib64/
|
|
22
|
+
parts/
|
|
23
|
+
sdist/
|
|
24
|
+
var/
|
|
25
|
+
*.egg-info/
|
|
26
|
+
.installed.cfg
|
|
27
|
+
*.egg
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.coverage
|
|
43
|
+
.coverage.*
|
|
44
|
+
.cache
|
|
45
|
+
nosetests.xml
|
|
46
|
+
coverage.xml
|
|
47
|
+
*,cover
|
|
48
|
+
|
|
49
|
+
# Translations
|
|
50
|
+
*.mo
|
|
51
|
+
*.pot
|
|
52
|
+
|
|
53
|
+
# Django stuff:
|
|
54
|
+
*.log
|
|
55
|
+
|
|
56
|
+
# Sphinx documentation
|
|
57
|
+
docs/_build/
|
|
58
|
+
|
|
59
|
+
# PyBuilder
|
|
60
|
+
target/
|
|
61
|
+
|
|
62
|
+
# Documentation
|
|
63
|
+
.doctrees
|
|
64
|
+
_build
|
|
65
|
+
doctrees
|
|
66
|
+
|
|
67
|
+
# Jetbrains
|
|
68
|
+
.idea
|
|
69
|
+
|
|
70
|
+
# Benchmark files
|
|
71
|
+
benchmark/**/*.inp.gz
|
|
72
|
+
|
|
73
|
+
# Benchmark output
|
|
74
|
+
benchmark/**/*.sqlite
|
|
75
|
+
|
|
76
|
+
# Benchmark plots
|
|
77
|
+
benchmark/**/*.png
|
|
78
|
+
benchmark/**/*.pdf
|
|
79
|
+
|
|
80
|
+
# Virtual env
|
|
81
|
+
.venv
|
|
82
|
+
|
|
83
|
+
# IDE
|
|
84
|
+
.vscode
|
|
85
|
+
|
|
86
|
+
# MacOS
|
|
87
|
+
.DS_Store
|
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datasketch
|
|
3
|
+
Version: 1.8.0
|
|
4
|
+
Summary: Probabilistic data structures for processing and searching very large datasets
|
|
5
|
+
Project-URL: Homepage, https://ekzhu.github.io/datasketch
|
|
6
|
+
Project-URL: Bug Tracker, https://github.com/ekzhu/datasketch/issues
|
|
7
|
+
Project-URL: Documentation, https://ekzhu.github.io/datasketch
|
|
8
|
+
Project-URL: Source, https://github.com/ekzhu/datasketch
|
|
9
|
+
Author-email: ekzhu <ekzhu@cs.toronto.edu>
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: database,datamining
|
|
13
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Database
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Requires-Dist: numpy>=1.11
|
|
24
|
+
Requires-Dist: scipy>=1.0.0
|
|
25
|
+
Provides-Extra: benchmark
|
|
26
|
+
Requires-Dist: matplotlib>=3.1.2; extra == 'benchmark'
|
|
27
|
+
Requires-Dist: nltk>=3.4.5; extra == 'benchmark'
|
|
28
|
+
Requires-Dist: pandas>=0.25.3; extra == 'benchmark'
|
|
29
|
+
Requires-Dist: pyfarmhash>=0.2.2; extra == 'benchmark'
|
|
30
|
+
Requires-Dist: pyhash>=0.9.3; extra == 'benchmark'
|
|
31
|
+
Requires-Dist: scikit-learn>=0.21.3; extra == 'benchmark'
|
|
32
|
+
Requires-Dist: scipy>=1.3.3; extra == 'benchmark'
|
|
33
|
+
Requires-Dist: setsimilaritysearch>=0.1.7; extra == 'benchmark'
|
|
34
|
+
Provides-Extra: bloom
|
|
35
|
+
Requires-Dist: pybloomfilter3>=0.7.2; extra == 'bloom'
|
|
36
|
+
Provides-Extra: cassandra
|
|
37
|
+
Requires-Dist: cassandra-driver>=3.20; extra == 'cassandra'
|
|
38
|
+
Provides-Extra: experimental-aio
|
|
39
|
+
Requires-Dist: aiounittest; extra == 'experimental-aio'
|
|
40
|
+
Requires-Dist: motor>3.6.0; extra == 'experimental-aio'
|
|
41
|
+
Provides-Extra: redis
|
|
42
|
+
Requires-Dist: redis>=2.10.0; extra == 'redis'
|
|
43
|
+
Provides-Extra: test
|
|
44
|
+
Requires-Dist: cassandra-driver>=3.20; extra == 'test'
|
|
45
|
+
Requires-Dist: coverage; extra == 'test'
|
|
46
|
+
Requires-Dist: mock>=2.0.0; extra == 'test'
|
|
47
|
+
Requires-Dist: mockredispy; extra == 'test'
|
|
48
|
+
Requires-Dist: nose-exclude>=0.5.0; extra == 'test'
|
|
49
|
+
Requires-Dist: nose>=1.3.7; extra == 'test'
|
|
50
|
+
Requires-Dist: pymongo>=3.9.0; extra == 'test'
|
|
51
|
+
Requires-Dist: pytest; extra == 'test'
|
|
52
|
+
Requires-Dist: pytest-rerunfailures; extra == 'test'
|
|
53
|
+
Requires-Dist: redis>=2.10.0; extra == 'test'
|
|
54
|
+
Description-Content-Type: text/x-rst
|
|
55
|
+
|
|
56
|
+
datasketch: Big Data Looks Small
|
|
57
|
+
================================
|
|
58
|
+
|
|
59
|
+
.. image:: https://static.pepy.tech/badge/datasketch/month
|
|
60
|
+
:target: https://pepy.tech/project/datasketch
|
|
61
|
+
|
|
62
|
+
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
|
|
63
|
+
:target: https://zenodo.org/doi/10.5281/zenodo.598238
|
|
64
|
+
|
|
65
|
+
datasketch gives you probabilistic data structures that can process and
|
|
66
|
+
search very large amounts of data super fast, with little loss of
|
|
67
|
+
accuracy.
|
|
68
|
+
|
|
69
|
+
This package contains the following data sketches:
|
|
70
|
+
|
|
71
|
+
+-------------------------+-----------------------------------------------+
|
|
72
|
+
| Data Sketch | Usage |
|
|
73
|
+
+=========================+===============================================+
|
|
74
|
+
| `MinHash`_ | estimate Jaccard similarity and cardinality |
|
|
75
|
+
+-------------------------+-----------------------------------------------+
|
|
76
|
+
| `Weighted MinHash`_ | estimate weighted Jaccard similarity |
|
|
77
|
+
+-------------------------+-----------------------------------------------+
|
|
78
|
+
| `HyperLogLog`_ | estimate cardinality |
|
|
79
|
+
+-------------------------+-----------------------------------------------+
|
|
80
|
+
| `HyperLogLog++`_ | estimate cardinality |
|
|
81
|
+
+-------------------------+-----------------------------------------------+
|
|
82
|
+
|
|
83
|
+
The following indexes for data sketches are provided to support
|
|
84
|
+
sub-linear query time:
|
|
85
|
+
|
|
86
|
+
+---------------------------+-----------------------------+------------------------+
|
|
87
|
+
| Index | For Data Sketch | Supported Query Type |
|
|
88
|
+
+===========================+=============================+========================+
|
|
89
|
+
| `MinHash LSH`_ | MinHash, Weighted MinHash | Jaccard Threshold |
|
|
90
|
+
+---------------------------+-----------------------------+------------------------+
|
|
91
|
+
| `LSHBloom`_ | MinHash, Weighted MinHash | Jaccard Threshold |
|
|
92
|
+
+---------------------------+-----------------------------+------------------------+
|
|
93
|
+
| `MinHash LSH Forest`_ | MinHash, Weighted MinHash | Jaccard Top-K |
|
|
94
|
+
+---------------------------+-----------------------------+------------------------+
|
|
95
|
+
| `MinHash LSH Ensemble`_ | MinHash | Containment Threshold |
|
|
96
|
+
+---------------------------+-----------------------------+------------------------+
|
|
97
|
+
| `HNSW`_ | Any | Custom Metric Top-K |
|
|
98
|
+
+---------------------------+-----------------------------+------------------------+
|
|
99
|
+
|
|
100
|
+
datasketch must be used with Python 3.9 or above, NumPy 1.11 or above, and SciPy 1.0.0 or above.
|
|
101
|
+
|
|
102
|
+
Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
|
|
103
|
+
storage layer (see `MinHash LSH at Scale`_).
|
|
104
|
+
|
|
105
|
+
Install
|
|
106
|
+
-------
|
|
107
|
+
|
|
108
|
+
To install datasketch using ``pip``:
|
|
109
|
+
|
|
110
|
+
.. code-block:: bash
|
|
111
|
+
|
|
112
|
+
pip install datasketch
|
|
113
|
+
|
|
114
|
+
This will also install NumPy and SciPy as dependencies.
|
|
115
|
+
|
|
116
|
+
To install with Redis dependency:
|
|
117
|
+
|
|
118
|
+
.. code-block:: bash
|
|
119
|
+
|
|
120
|
+
pip install datasketch[redis]
|
|
121
|
+
|
|
122
|
+
To install with Cassandra dependency:
|
|
123
|
+
|
|
124
|
+
.. code-block:: bash
|
|
125
|
+
|
|
126
|
+
pip install datasketch[cassandra]
|
|
127
|
+
|
|
128
|
+
To install with Bloom filter dependency:
|
|
129
|
+
|
|
130
|
+
.. code-block:: bash
|
|
131
|
+
|
|
132
|
+
pip install datasketch[bloom]
|
|
133
|
+
|
|
134
|
+
.. _`MinHash`: https://ekzhu.github.io/datasketch/minhash.html
|
|
135
|
+
.. _`Weighted MinHash`: https://ekzhu.github.io/datasketch/weightedminhash.html
|
|
136
|
+
.. _`HyperLogLog`: https://ekzhu.github.io/datasketch/hyperloglog.html
|
|
137
|
+
.. _`HyperLogLog++`: https://ekzhu.github.io/datasketch/hyperloglog.html#hyperloglog-plusplus
|
|
138
|
+
.. _`MinHash LSH`: https://ekzhu.github.io/datasketch/lsh.html
|
|
139
|
+
.. _`MinHash LSH Forest`: https://ekzhu.github.io/datasketch/lshforest.html
|
|
140
|
+
.. _`MinHash LSH Ensemble`: https://ekzhu.github.io/datasketch/lshensemble.html
|
|
141
|
+
.. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
|
|
142
|
+
.. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
|
|
143
|
+
.. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
|
|
144
|
+
|
|
145
|
+
Contributing
|
|
146
|
+
------------
|
|
147
|
+
|
|
148
|
+
We welcome contributions from everyone. Whether you're fixing bugs, adding features, improving documentation, or helping with tests, your contributions are valuable.
|
|
149
|
+
|
|
150
|
+
Development Setup
|
|
151
|
+
^^^^^^^^^^^^^^^^^
|
|
152
|
+
|
|
153
|
+
The project uses ``uv`` for fast and reliable Python package management. Follow these steps to set up your development environment:
|
|
154
|
+
|
|
155
|
+
1. **Install uv**: Follow the official installation guide at https://docs.astral.sh/uv/getting-started/installation/
|
|
156
|
+
|
|
157
|
+
2. **Clone the repository**:
|
|
158
|
+
|
|
159
|
+
.. code-block:: bash
|
|
160
|
+
|
|
161
|
+
git clone https://github.com/ekzhu/datasketch.git
|
|
162
|
+
cd datasketch
|
|
163
|
+
|
|
164
|
+
3. **Set up the environment**:
|
|
165
|
+
|
|
166
|
+
.. code-block:: bash
|
|
167
|
+
|
|
168
|
+
# Create a virtual environment
|
|
169
|
+
# (Optional: specify Python version with --python 3.x)
|
|
170
|
+
uv venv
|
|
171
|
+
# Activate the virtual environment (optional, uv run commands work without it)
|
|
172
|
+
source .venv/bin/activate
|
|
173
|
+
|
|
174
|
+
# Install all dependencies
|
|
175
|
+
uv sync
|
|
176
|
+
|
|
177
|
+
4. **Verify installation**:
|
|
178
|
+
|
|
179
|
+
.. code-block:: bash
|
|
180
|
+
|
|
181
|
+
# Run tests to ensure everything works
|
|
182
|
+
uv run pytest
|
|
183
|
+
|
|
184
|
+
5. **Optional dependencies** (for specific development needs):
|
|
185
|
+
|
|
186
|
+
.. code-block:: bash
|
|
187
|
+
|
|
188
|
+
# For testing
|
|
189
|
+
uv sync --extra test
|
|
190
|
+
|
|
191
|
+
# For Cassandra support
|
|
192
|
+
uv sync --extra cassandra
|
|
193
|
+
|
|
194
|
+
# For Redis support
|
|
195
|
+
uv sync --extra redis
|
|
196
|
+
|
|
197
|
+
# For all extras
|
|
198
|
+
uv sync --all-extras
|
|
199
|
+
|
|
200
|
+
Learn more about ``uv`` at https://docs.astral.sh/uv/
|
|
201
|
+
|
|
202
|
+
Development Workflow
|
|
203
|
+
^^^^^^^^^^^^^^^^^^^^
|
|
204
|
+
|
|
205
|
+
1. **Fork the repository** on GitHub if you haven't already.
|
|
206
|
+
|
|
207
|
+
2. **Create a feature branch** for your changes:
|
|
208
|
+
|
|
209
|
+
.. code-block:: bash
|
|
210
|
+
|
|
211
|
+
git checkout -b feature/your-feature-name
|
|
212
|
+
# Or for bug fixes:
|
|
213
|
+
git checkout -b fix/issue-description
|
|
214
|
+
|
|
215
|
+
3. **Make your changes** following the project's coding standards.
|
|
216
|
+
|
|
217
|
+
4. **Run the tests** to ensure nothing is broken:
|
|
218
|
+
|
|
219
|
+
.. code-block:: bash
|
|
220
|
+
|
|
221
|
+
uv run pytest
|
|
222
|
+
|
|
223
|
+
5. **Check code quality** with ruff:
|
|
224
|
+
|
|
225
|
+
.. code-block:: bash
|
|
226
|
+
|
|
227
|
+
# Check for issues
|
|
228
|
+
uvx ruff check .
|
|
229
|
+
|
|
230
|
+
# Auto-fix formatting issues
|
|
231
|
+
uvx ruff format .
|
|
232
|
+
|
|
233
|
+
6. **Commit your changes** with a clear, descriptive commit message:
|
|
234
|
+
|
|
235
|
+
.. code-block:: bash
|
|
236
|
+
|
|
237
|
+
git commit -m "Add feature: brief description of what was changed"
|
|
238
|
+
|
|
239
|
+
7. **Push to your fork** and create a pull request on GitHub:
|
|
240
|
+
|
|
241
|
+
.. code-block:: bash
|
|
242
|
+
|
|
243
|
+
git push origin your-branch-name
|
|
244
|
+
|
|
245
|
+
8. **Respond to feedback** from maintainers and iterate on your changes.
|
|
246
|
+
|
|
247
|
+
Guidelines
|
|
248
|
+
^^^^^^^^^^
|
|
249
|
+
|
|
250
|
+
- Follow PEP 8 style guidelines
|
|
251
|
+
- Write tests for new features
|
|
252
|
+
- Update documentation as needed
|
|
253
|
+
- Keep commits focused and atomic
|
|
254
|
+
- Be respectful in discussions
|
|
255
|
+
|
|
256
|
+
For more information, check the `GitHub issues <https://github.com/ekzhu/datasketch/issues>`_ for current priorities or areas needing help. You can also join the discussion on `project roadmap and priorities <https://github.com/ekzhu/datasketch/discussions/252>`_.
|
|
@@ -1,53 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
2
|
-
Name: datasketch
|
|
3
|
-
Version: 1.6.5
|
|
4
|
-
Summary: Probabilistic data structures for processing and searching very large datasets
|
|
5
|
-
Home-page: https://ekzhu.github.io/datasketch
|
|
6
|
-
Author: ekzhu
|
|
7
|
-
Author-email: ekzhu@cs.toronto.edu
|
|
8
|
-
License: MIT
|
|
9
|
-
Project-URL: Source, https://github.com/ekzhu/datasketch
|
|
10
|
-
Keywords: database datamining
|
|
11
|
-
Classifier: Development Status :: 5 - Production/Stable
|
|
12
|
-
Classifier: Intended Audience :: Developers
|
|
13
|
-
Classifier: Topic :: Database
|
|
14
|
-
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
15
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.7
|
|
17
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
-
License-File: LICENSE
|
|
22
|
-
Requires-Dist: numpy>=1.11
|
|
23
|
-
Requires-Dist: scipy>=1.0.0
|
|
24
|
-
Provides-Extra: cassandra
|
|
25
|
-
Requires-Dist: cassandra-driver>=3.20; extra == "cassandra"
|
|
26
|
-
Provides-Extra: redis
|
|
27
|
-
Requires-Dist: redis>=2.10.0; extra == "redis"
|
|
28
|
-
Provides-Extra: benchmark
|
|
29
|
-
Requires-Dist: pyhash>=0.9.3; extra == "benchmark"
|
|
30
|
-
Requires-Dist: matplotlib>=3.1.2; extra == "benchmark"
|
|
31
|
-
Requires-Dist: scikit-learn>=0.21.3; extra == "benchmark"
|
|
32
|
-
Requires-Dist: scipy>=1.3.3; extra == "benchmark"
|
|
33
|
-
Requires-Dist: pandas>=0.25.3; extra == "benchmark"
|
|
34
|
-
Requires-Dist: SetSimilaritySearch>=0.1.7; extra == "benchmark"
|
|
35
|
-
Requires-Dist: pyfarmhash>=0.2.2; extra == "benchmark"
|
|
36
|
-
Requires-Dist: nltk>=3.4.5; extra == "benchmark"
|
|
37
|
-
Provides-Extra: test
|
|
38
|
-
Requires-Dist: cassandra-driver>=3.20; extra == "test"
|
|
39
|
-
Requires-Dist: redis>=2.10.0; extra == "test"
|
|
40
|
-
Requires-Dist: mock>=2.0.0; extra == "test"
|
|
41
|
-
Requires-Dist: mockredispy; extra == "test"
|
|
42
|
-
Requires-Dist: coverage; extra == "test"
|
|
43
|
-
Requires-Dist: pymongo>=3.9.0; extra == "test"
|
|
44
|
-
Requires-Dist: nose>=1.3.7; extra == "test"
|
|
45
|
-
Requires-Dist: nose-exclude>=0.5.0; extra == "test"
|
|
46
|
-
Requires-Dist: pytest; extra == "test"
|
|
47
|
-
Provides-Extra: experimental-aio
|
|
48
|
-
Requires-Dist: aiounittest; python_version >= "3.6" and extra == "experimental-aio"
|
|
49
|
-
Requires-Dist: motor; python_version >= "3.6" and extra == "experimental-aio"
|
|
50
|
-
|
|
51
1
|
datasketch: Big Data Looks Small
|
|
52
2
|
================================
|
|
53
3
|
|
|
@@ -83,6 +33,8 @@ sub-linear query time:
|
|
|
83
33
|
+===========================+=============================+========================+
|
|
84
34
|
| `MinHash LSH`_ | MinHash, Weighted MinHash | Jaccard Threshold |
|
|
85
35
|
+---------------------------+-----------------------------+------------------------+
|
|
36
|
+
| `LSHBloom`_ | MinHash, Weighted MinHash | Jaccard Threshold |
|
|
37
|
+
+---------------------------+-----------------------------+------------------------+
|
|
86
38
|
| `MinHash LSH Forest`_ | MinHash, Weighted MinHash | Jaccard Top-K |
|
|
87
39
|
+---------------------------+-----------------------------+------------------------+
|
|
88
40
|
| `MinHash LSH Ensemble`_ | MinHash | Containment Threshold |
|
|
@@ -90,7 +42,7 @@ sub-linear query time:
|
|
|
90
42
|
| `HNSW`_ | Any | Custom Metric Top-K |
|
|
91
43
|
+---------------------------+-----------------------------+------------------------+
|
|
92
44
|
|
|
93
|
-
datasketch must be used with Python 3.
|
|
45
|
+
datasketch must be used with Python 3.9 or above, NumPy 1.11 or above, and SciPy 1.0.0 or above.
|
|
94
46
|
|
|
95
47
|
Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
|
|
96
48
|
storage layer (see `MinHash LSH at Scale`_).
|
|
@@ -100,7 +52,7 @@ Install
|
|
|
100
52
|
|
|
101
53
|
To install datasketch using ``pip``:
|
|
102
54
|
|
|
103
|
-
::
|
|
55
|
+
.. code-block:: bash
|
|
104
56
|
|
|
105
57
|
pip install datasketch
|
|
106
58
|
|
|
@@ -108,16 +60,21 @@ This will also install NumPy as dependency.
|
|
|
108
60
|
|
|
109
61
|
To install with Redis dependency:
|
|
110
62
|
|
|
111
|
-
::
|
|
63
|
+
.. code-block:: bash
|
|
112
64
|
|
|
113
65
|
pip install datasketch[redis]
|
|
114
66
|
|
|
115
67
|
To install with Cassandra dependency:
|
|
116
68
|
|
|
117
|
-
::
|
|
69
|
+
.. code-block:: bash
|
|
118
70
|
|
|
119
71
|
pip install datasketch[cassandra]
|
|
120
72
|
|
|
73
|
+
To install with Bloom filter dependency:
|
|
74
|
+
|
|
75
|
+
.. code-block:: bash
|
|
76
|
+
|
|
77
|
+
pip install datasketch[bloom]
|
|
121
78
|
|
|
122
79
|
.. _`MinHash`: https://ekzhu.github.io/datasketch/minhash.html
|
|
123
80
|
.. _`Weighted MinHash`: https://ekzhu.github.io/datasketch/weightedminhash.html
|
|
@@ -126,5 +83,119 @@ To install with Cassandra dependency:
|
|
|
126
83
|
.. _`MinHash LSH`: https://ekzhu.github.io/datasketch/lsh.html
|
|
127
84
|
.. _`MinHash LSH Forest`: https://ekzhu.github.io/datasketch/lshforest.html
|
|
128
85
|
.. _`MinHash LSH Ensemble`: https://ekzhu.github.io/datasketch/lshensemble.html
|
|
86
|
+
.. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
|
|
129
87
|
.. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
|
|
130
88
|
.. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
|
|
89
|
+
|
|
90
|
+
Contributing
|
|
91
|
+
------------
|
|
92
|
+
|
|
93
|
+
We welcome contributions from everyone. Whether you're fixing bugs, adding features, improving documentation, or helping with tests, your contributions are valuable.
|
|
94
|
+
|
|
95
|
+
Development Setup
|
|
96
|
+
^^^^^^^^^^^^^^^^^
|
|
97
|
+
|
|
98
|
+
The project uses ``uv`` for fast and reliable Python package management. Follow these steps to set up your development environment:
|
|
99
|
+
|
|
100
|
+
1. **Install uv**: Follow the official installation guide at https://docs.astral.sh/uv/getting-started/installation/
|
|
101
|
+
|
|
102
|
+
2. **Clone the repository**:
|
|
103
|
+
|
|
104
|
+
.. code-block:: bash
|
|
105
|
+
|
|
106
|
+
git clone https://github.com/ekzhu/datasketch.git
|
|
107
|
+
cd datasketch
|
|
108
|
+
|
|
109
|
+
3. **Set up the environment**:
|
|
110
|
+
|
|
111
|
+
.. code-block:: bash
|
|
112
|
+
|
|
113
|
+
# Create a virtual environment
|
|
114
|
+
# (Optional: specify Python version with --python 3.x)
|
|
115
|
+
uv venv
|
|
116
|
+
# Activate the virtual environment (optional, uv run commands work without it)
|
|
117
|
+
source .venv/bin/activate
|
|
118
|
+
|
|
119
|
+
# Install all dependencies
|
|
120
|
+
uv sync
|
|
121
|
+
|
|
122
|
+
4. **Verify installation**:
|
|
123
|
+
|
|
124
|
+
.. code-block:: bash
|
|
125
|
+
|
|
126
|
+
# Run tests to ensure everything works
|
|
127
|
+
uv run pytest
|
|
128
|
+
|
|
129
|
+
5. **Optional dependencies** (for specific development needs):
|
|
130
|
+
|
|
131
|
+
.. code-block:: bash
|
|
132
|
+
|
|
133
|
+
# For testing
|
|
134
|
+
uv sync --extra test
|
|
135
|
+
|
|
136
|
+
# For Cassandra support
|
|
137
|
+
uv sync --extra cassandra
|
|
138
|
+
|
|
139
|
+
# For Redis support
|
|
140
|
+
uv sync --extra redis
|
|
141
|
+
|
|
142
|
+
# For all extras
|
|
143
|
+
uv sync --all-extras
|
|
144
|
+
|
|
145
|
+
Learn more about ``uv`` at https://docs.astral.sh/uv/
|
|
146
|
+
|
|
147
|
+
Development Workflow
|
|
148
|
+
^^^^^^^^^^^^^^^^^^^^
|
|
149
|
+
|
|
150
|
+
1. **Fork the repository** on GitHub if you haven't already.
|
|
151
|
+
|
|
152
|
+
2. **Create a feature branch** for your changes:
|
|
153
|
+
|
|
154
|
+
.. code-block:: bash
|
|
155
|
+
|
|
156
|
+
git checkout -b feature/your-feature-name
|
|
157
|
+
# Or for bug fixes:
|
|
158
|
+
git checkout -b fix/issue-description
|
|
159
|
+
|
|
160
|
+
3. **Make your changes** following the project's coding standards.
|
|
161
|
+
|
|
162
|
+
4. **Run the tests** to ensure nothing is broken:
|
|
163
|
+
|
|
164
|
+
.. code-block:: bash
|
|
165
|
+
|
|
166
|
+
uv run pytest
|
|
167
|
+
|
|
168
|
+
5. **Check code quality** with ruff:
|
|
169
|
+
|
|
170
|
+
.. code-block:: bash
|
|
171
|
+
|
|
172
|
+
# Check for issues
|
|
173
|
+
uvx ruff check .
|
|
174
|
+
|
|
175
|
+
# Auto-fix formatting issues
|
|
176
|
+
uvx ruff format .
|
|
177
|
+
|
|
178
|
+
6. **Commit your changes** with a clear, descriptive commit message:
|
|
179
|
+
|
|
180
|
+
.. code-block:: bash
|
|
181
|
+
|
|
182
|
+
git commit -m "Add feature: brief description of what was changed"
|
|
183
|
+
|
|
184
|
+
7. **Push to your fork** and create a pull request on GitHub:
|
|
185
|
+
|
|
186
|
+
.. code-block:: bash
|
|
187
|
+
|
|
188
|
+
git push origin your-branch-name
|
|
189
|
+
|
|
190
|
+
8. **Respond to feedback** from maintainers and iterate on your changes.
|
|
191
|
+
|
|
192
|
+
Guidelines
|
|
193
|
+
^^^^^^^^^^
|
|
194
|
+
|
|
195
|
+
- Follow PEP 8 style guidelines
|
|
196
|
+
- Write tests for new features
|
|
197
|
+
- Update documentation as needed
|
|
198
|
+
- Keep commits focused and atomic
|
|
199
|
+
- Be respectful in discussions
|
|
200
|
+
|
|
201
|
+
For more information, check the `GitHub issues <https://github.com/ekzhu/datasketch/issues>`_ for current priorities or areas needing help. You can also join the discussion on `project roadmap and priorities <https://github.com/ekzhu/datasketch/discussions/252>`_.
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import importlib.metadata
|
|
2
|
+
from typing import Final
|
|
3
|
+
|
|
4
|
+
try:
|
|
5
|
+
_version = importlib.metadata.version(__name__)
|
|
6
|
+
except importlib.metadata.PackageNotFoundError:
|
|
7
|
+
_version = "0.0.0" # Fallback for development mode
|
|
8
|
+
__version__: Final[str] = _version
|
|
9
|
+
|
|
10
|
+
from datasketch.b_bit_minhash import bBitMinHash
|
|
11
|
+
from datasketch.hashfunc import sha1_hash32
|
|
12
|
+
from datasketch.hnsw import HNSW
|
|
13
|
+
from datasketch.hyperloglog import HyperLogLog, HyperLogLogPlusPlus
|
|
14
|
+
from datasketch.lean_minhash import LeanMinHash
|
|
15
|
+
from datasketch.lsh import MinHashLSH
|
|
16
|
+
from datasketch.lsh_bloom import MinHashLSHBloom
|
|
17
|
+
from datasketch.lshensemble import MinHashLSHEnsemble
|
|
18
|
+
from datasketch.lshforest import MinHashLSHForest
|
|
19
|
+
from datasketch.minhash import MinHash
|
|
20
|
+
from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerator
|
|
21
|
+
|
|
22
|
+
# Alias
|
|
23
|
+
WeightedMinHashLSH = MinHashLSH
|
|
24
|
+
WeightedMinHashLSHForest = MinHashLSHForest
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
__all__ = [
|
|
28
|
+
"HNSW",
|
|
29
|
+
"HyperLogLog",
|
|
30
|
+
"HyperLogLogPlusPlus",
|
|
31
|
+
"LeanMinHash",
|
|
32
|
+
"MinHash",
|
|
33
|
+
"MinHashLSH",
|
|
34
|
+
"MinHashLSHBloom",
|
|
35
|
+
"MinHashLSHEnsemble",
|
|
36
|
+
"MinHashLSHForest",
|
|
37
|
+
"WeightedMinHash",
|
|
38
|
+
"WeightedMinHashGenerator",
|
|
39
|
+
"WeightedMinHashLSH",
|
|
40
|
+
"WeightedMinHashLSHForest",
|
|
41
|
+
"bBitMinHash",
|
|
42
|
+
"sha1_hash32",
|
|
43
|
+
]
|