lshrs 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lshrs-0.1.0/LICENSE +21 -0
- lshrs-0.1.0/PKG-INFO +309 -0
- lshrs-0.1.0/README.md +286 -0
- lshrs-0.1.0/pyproject.toml +34 -0
- lshrs-0.1.0/src/lshrs/__init__.py +0 -0
- lshrs-0.1.0/src/lshrs/core/__init__.py +33 -0
- lshrs-0.1.0/src/lshrs/core/config.py +90 -0
- lshrs-0.1.0/src/lshrs/core/dataloader.py +196 -0
- lshrs-0.1.0/src/lshrs/core/exceptions.py +66 -0
- lshrs-0.1.0/src/lshrs/core/interfaces.py +131 -0
- lshrs-0.1.0/src/lshrs/core/main.py +202 -0
- lshrs-0.1.0/src/lshrs/encoding/__init__.py +0 -0
- lshrs-0.1.0/src/lshrs/encoding/embedding.py +19 -0
- lshrs-0.1.0/src/lshrs/encoding/main.py +14 -0
- lshrs-0.1.0/src/lshrs/encoding/onehot.py +73 -0
- lshrs-0.1.0/src/lshrs/encoding/tfidf.py +83 -0
- lshrs-0.1.0/src/lshrs/hashing/__init__.py +0 -0
- lshrs-0.1.0/src/lshrs/hashing/hyperplane.py +50 -0
- lshrs-0.1.0/src/lshrs/hashing/lsh.py +42 -0
- lshrs-0.1.0/src/lshrs/hashing/minhash.py +134 -0
- lshrs-0.1.0/src/lshrs/preprocessing/__init__.py +0 -0
- lshrs-0.1.0/src/lshrs/preprocessing/lemmatize.py +44 -0
- lshrs-0.1.0/src/lshrs/preprocessing/shingling.py +18 -0
- lshrs-0.1.0/src/lshrs/preprocessing/stopwords.py +19 -0
- lshrs-0.1.0/src/lshrs/preprocessing/website.py +53 -0
- lshrs-0.1.0/src/lshrs/utils/__init__.py +0 -0
- lshrs-0.1.0/src/lshrs/utils/br.py +54 -0
- lshrs-0.1.0/src/lshrs/utils/helpers.py +5 -0
- lshrs-0.1.0/src/lshrs/utils/save.py +619 -0
- lshrs-0.1.0/src/lshrs/utils/similarity.py +9 -0
lshrs-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Y. Zhao, M. Guan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
lshrs-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: lshrs
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Locality Sensitive Hashing based recommendation system for efficient similarity search.
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: Y. Zhao
|
|
7
|
+
Author-email: yimingzhao936@gmail.com
|
|
8
|
+
Requires-Python: >3.11
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Requires-Dist: build
|
|
15
|
+
Requires-Dist: nltk
|
|
16
|
+
Requires-Dist: numpy
|
|
17
|
+
Requires-Dist: scikit-learn
|
|
18
|
+
Requires-Dist: scipy
|
|
19
|
+
Requires-Dist: twine
|
|
20
|
+
Project-URL: Homepage, https://github.com/mxngjxa/lshrs
|
|
21
|
+
Project-URL: Repository, https://github.com/mxngjxa/lshrs
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
# Locality Sensitive Hashing Recommendation System
|
|
25
|
+
|
|
26
|
+
[License](LICENSE)
|
|
27
|
+
[Python](https://www.python.org/downloads/)
|
|
28
|
+
[PyPI: locality-sensitive-hashing-recommendation-system](https://pypi.org/project/locality-sensitive-hashing-recommendation-system/)
|
|
29
|
+
[Lint workflow](https://github.com/mxngjxa/lshrs/actions/workflows/lint.yml)
|
|
30
|
+
[Deployments](https://github.com/mxngjxa/lshrs/deployments)
|
|
31
|
+
[Ruff](https://github.com/astral-sh/ruff)
|
|
32
|
+
[PyPI: lshrs](https://pypi.org/project/lshrs/)
|
|
33
|
+
[Contributors](https://GitHub.com/mxngjxa/lshrs/graphs/contributors/)
|
|
34
|
+
[Commit activity](https://GitHub.com/mxngjxa/lshrs/graphs/commit-activity)
|
|
35
|
+
|
|
36
|
+
A Locality Sensitive Hashing (LSH) based recommendation system for efficient similarity search in Python.
|
|
37
|
+
|
|
38
|
+
## Table of Contents
|
|
39
|
+
|
|
40
|
+
- [Project Structure](#project-structure)
|
|
41
|
+
- [Installation](#installation)
|
|
42
|
+
- [Usage](#usage)
|
|
43
|
+
- [Development](#development)
|
|
44
|
+
- [Architecture](#architecture)
|
|
45
|
+
- [License](#license)
|
|
46
|
+
- [Authors](#authors)
|
|
47
|
+
- [Changelog](#changelog)
|
|
48
|
+
- [Contributing](#contributing)
|
|
49
|
+
|
|
50
|
+
## Project Structure
|
|
51
|
+
|
|
52
|
+
The project is organized as follows:
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
.
|
|
56
|
+
├── CHANGELOG.md
|
|
57
|
+
├── docs
|
|
58
|
+
│ ├── api
|
|
59
|
+
│ ├── architecture.md
|
|
60
|
+
│ ├── examples
|
|
61
|
+
│ ├── index.md
|
|
62
|
+
│ ├── project_structure.txt
|
|
63
|
+
│ └── quickstart.md
|
|
64
|
+
├── examples
|
|
65
|
+
│ ├── advanced_usage.py
|
|
66
|
+
│ └── basic_usage.py
|
|
67
|
+
├── LICENSE
|
|
68
|
+
├── pyproject.toml
|
|
69
|
+
├── README.md
|
|
70
|
+
├── src
|
|
71
|
+
│ └── lshrs
|
|
72
|
+
│ ├── core
|
|
73
|
+
│ ├── encoding
|
|
74
|
+
│ ├── hashing
|
|
75
|
+
│ ├── preprocessing
|
|
76
|
+
│ └── utils
|
|
77
|
+
└── tests
|
|
78
|
+
├── fixtures
|
|
79
|
+
├── integration
|
|
80
|
+
└── unit
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
- **docs**: Contains project documentation.
|
|
84
|
+
- **examples**: Contains example scripts for using the library.
|
|
85
|
+
- **src/lshrs**: Contains the source code for the `lshrs` library.
|
|
86
|
+
- **tests**: Contains unit and integration tests.
|
|
87
|
+
|
|
88
|
+
## Installation
|
|
89
|
+
|
|
90
|
+
This project uses [Poetry](https://python-poetry.org/) for dependency management. Here’s how to set up the development environment from scratch.
|
|
91
|
+
|
|
92
|
+
### 1. Install `pipx` (Recommended)
|
|
93
|
+
|
|
94
|
+
`pipx` is a tool to help you install and run Python applications in isolated environments. It's the recommended way to install `poetry`.
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
python3 -m pip install --user pipx
|
|
98
|
+
python3 -m pipx ensurepath
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
After running this, you may need to restart your terminal for the `pipx` command to be available.
|
|
102
|
+
|
|
103
|
+
### 2. Install Poetry
|
|
104
|
+
|
|
105
|
+
Once `pipx` is installed, you can use it to install Poetry:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
pipx install poetry
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### 3. Set Up the Project
|
|
112
|
+
|
|
113
|
+
Now, you can set up the project itself.
|
|
114
|
+
|
|
115
|
+
1. **Clone the repository:**
|
|
116
|
+
```bash
|
|
117
|
+
git clone https://github.com/mxngjxa/lshrs.git
|
|
118
|
+
cd lshrs
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
2. **macOS Prerequisite: Install `gfortran`**
|
|
122
|
+
If you are on macOS, you will need to install a Fortran compiler for the `scipy` dependency to build correctly. The easiest way is to use [Homebrew](https://brew.sh/):
|
|
123
|
+
```bash
|
|
124
|
+
brew install gfortran
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
3. **Create a local virtual environment.**
|
|
128
|
+
It's recommended to create a virtual environment in the project's root directory.
|
|
129
|
+
```bash
|
|
130
|
+
python -m venv .venv
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
4. **Configure Poetry to use the local virtual environment.**
|
|
134
|
+
This step ensures that Poetry installs dependencies into the `.venv` directory you just created.
|
|
135
|
+
```bash
|
|
136
|
+
poetry config virtualenvs.in-project true
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
5. **Install dependencies.**
|
|
140
|
+
Finally, use Poetry to install the project's dependencies.
|
|
141
|
+
```bash
|
|
142
|
+
poetry install
|
|
143
|
+
```
|
|
144
|
+
This will install all the dependencies defined in the `pyproject.toml` file.
|
|
145
|
+
|
|
146
|
+
## Usage
|
|
147
|
+
|
|
148
|
+
You can find basic and advanced usage examples in the `examples` directory.
|
|
149
|
+
|
|
150
|
+
- [`basic_usage.py`](examples/basic_usage.py)
|
|
151
|
+
- [`advanced_usage.py`](examples/advanced_usage.py)
|
|
152
|
+
|
|
153
|
+
## Development
|
|
154
|
+
|
|
155
|
+
This project uses `ruff` for linting and formatting.
|
|
156
|
+
|
|
157
|
+
- **Linting:** To check for any style issues or errors, run the following command:
|
|
158
|
+
```bash
|
|
159
|
+
poetry run ruff check src
|
|
160
|
+
```
|
|
161
|
+
- **Formatting:** To automatically apply the fixes that `ruff` can make for the issues it finds, run this command:
|
|
162
|
+
```bash
|
|
163
|
+
poetry run ruff check --fix src
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## Architecture
|
|
167
|
+
|
|
168
|
+
The following diagram illustrates the architecture of the LSH recommendation system:
|
|
169
|
+
|
|
170
|
+
```mermaid
|
|
171
|
+
---
|
|
172
|
+
config:
|
|
173
|
+
layout: dagre
|
|
174
|
+
---
|
|
175
|
+
flowchart TD
|
|
176
|
+
|
|
177
|
+
A[("Text Documents")]
|
|
178
|
+
A --> B["**DataLoader**
|
|
179
|
+
- Indexing
|
|
180
|
+
- Full Representation
|
|
181
|
+
- Signature
|
|
182
|
+
- Embeddings"]
|
|
183
|
+
|
|
184
|
+
B --> C["Preprocessing"]
|
|
185
|
+
|
|
186
|
+
subgraph Preprocessing
|
|
187
|
+
direction LR
|
|
188
|
+
C --> D["Tokenize"]
|
|
189
|
+
D --> E["Lemmatize"]
|
|
190
|
+
E --> F["Remove Stopwords"]
|
|
191
|
+
F --> G["Shingling"]
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
C --> H["Vectorization"]
|
|
195
|
+
|
|
196
|
+
subgraph Vectorization
|
|
197
|
+
direction LR
|
|
198
|
+
H --> I["TF-IDF"]
|
|
199
|
+
H --> J["One-Hot Encoding"]
|
|
200
|
+
H --> K["Embeddings"]
|
|
201
|
+
end
|
|
202
|
+
|
|
203
|
+
I --> L["Cosine Similarity"]
|
|
204
|
+
J --> M["Jaccard Similarity"]
|
|
205
|
+
K --> L
|
|
206
|
+
|
|
207
|
+
subgraph Hashing
|
|
208
|
+
direction LR
|
|
209
|
+
L --> N["Hyperplane Hashing"]
|
|
210
|
+
M --> O["MinHash"]
|
|
211
|
+
end
|
|
212
|
+
|
|
213
|
+
N --> P["LSH"]
|
|
214
|
+
O --> P
|
|
215
|
+
|
|
216
|
+
P --> Q["Candidate Pairs"]
|
|
217
|
+
Q --> R["Similarity Calculation"]
|
|
218
|
+
R --> S["Top-N Recommendations"]
|
|
219
|
+
|
|
220
|
+
S --> T["Output"]
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
## Core Orchestration of the `lshrs` Library
|
|
224
|
+
|
|
225
|
+
The `src/lshrs` directory contains the source code for the `lshrs` library, a Python-based recommendation system using Locality Sensitive Hashing (LSH).
|
|
226
|
+
|
|
227
|
+
### Modules
|
|
228
|
+
|
|
229
|
+
The `lshrs` library is organized into the following modules:
|
|
230
|
+
|
|
231
|
+
- [Core](#core)
|
|
232
|
+
- [Encoding](#encoding)
|
|
233
|
+
- [Hashing](#hashing)
|
|
234
|
+
- [Preprocessing](#preprocessing)
|
|
235
|
+
- [Utils](#utils)
|
|
236
|
+
|
|
237
|
+
---
|
|
238
|
+
|
|
239
|
+
#### Core
|
|
240
|
+
|
|
241
|
+
The `core` module contains the main components for running the LSH recommendation system.
|
|
242
|
+
|
|
243
|
+
- **`config.py`**: Defines configuration settings for the application.
|
|
244
|
+
- **`dataloader.py`**: Handles loading and preparing data for the LSH process.
|
|
245
|
+
- **`exceptions.py`**: Defines custom exception classes for error handling.
|
|
246
|
+
- **`interfaces.py`**: Contains interface definitions for different components.
|
|
247
|
+
- **`main.py`**: The main entry point for running the LSH recommendation system.
|
|
248
|
+
|
|
249
|
+
---
|
|
250
|
+
|
|
251
|
+
#### Encoding
|
|
252
|
+
|
|
253
|
+
The `encoding` module provides different methods for vectorizing text data.
|
|
254
|
+
|
|
255
|
+
- **`embedding.py`**: Implements word embedding techniques.
|
|
256
|
+
- **`main.py`**: Main script for handling encoding processes.
|
|
257
|
+
- **`onehot.py`**: Implements one-hot encoding.
|
|
258
|
+
- **`tfidf.py`**: Implements TF-IDF (Term Frequency-Inverse Document Frequency) vectorization.
|
|
259
|
+
|
|
260
|
+
---
|
|
261
|
+
|
|
262
|
+
#### Hashing
|
|
263
|
+
|
|
264
|
+
The `hashing` module contains different hashing algorithms used in LSH.
|
|
265
|
+
|
|
266
|
+
- **`hyperplane.py`**: Implements hyperplane-based hashing for cosine similarity.
|
|
267
|
+
- **`lsh.py`**: The main LSH implementation that combines hashing and candidate selection.
|
|
268
|
+
- **`minhash.py`**: Implements MinHash for Jaccard similarity.
|
|
269
|
+
|
|
270
|
+
---
|
|
271
|
+
|
|
272
|
+
#### Preprocessing
|
|
273
|
+
|
|
274
|
+
The `preprocessing` module provides tools for cleaning and preparing text data.
|
|
275
|
+
|
|
276
|
+
- **`lemmatize.py`**: Implements lemmatization to reduce words to their base form.
|
|
277
|
+
- **`shingling.py`**: Implements shingling to create k-shingles from text.
|
|
278
|
+
- **`stopwords.py`**: Provides functionality for removing stopwords.
|
|
279
|
+
- **`website.py`**: Contains functions for preprocessing website content.
|
|
280
|
+
|
|
281
|
+
---
|
|
282
|
+
|
|
283
|
+
#### Utils
|
|
284
|
+
|
|
285
|
+
The `utils` module contains helper functions and utilities used across the library.
|
|
286
|
+
|
|
287
|
+
- **`br.py`**: Implements the band-and-row (BR) technique for LSH.
|
|
288
|
+
- **`helpers.py`**: Contains general helper functions.
|
|
289
|
+
- **`save.py`**: Provides functionality for saving and loading data.
|
|
290
|
+
- **`similarity.py`**: Contains functions for calculating similarity between vectors.
|
|
291
|
+
|
|
292
|
+
---
|
|
293
|
+
|
|
294
|
+
## License
|
|
295
|
+
|
|
296
|
+
This project is licensed under the MIT License. See the [`LICENSE`](LICENSE) file for details.
|
|
297
|
+
|
|
298
|
+
## Authors
|
|
299
|
+
|
|
300
|
+
- Y. Zhao ([yimingzhao936@gmail.com](mailto:yimingzhao936@gmail.com))
|
|
301
|
+
- M. Guan ([mingjia.guan@outlook.com](mailto:mingjia.guan@outlook.com))
|
|
302
|
+
|
|
303
|
+
## Changelog
|
|
304
|
+
|
|
305
|
+
See the [`CHANGELOG.md`](CHANGELOG.md) file for a history of changes to the project.
|
|
306
|
+
|
|
307
|
+
## Contributing
|
|
308
|
+
|
|
309
|
+
Contributions are welcome! Please see the [Development](#development) section for linting and formatting guidelines.
|
lshrs-0.1.0/README.md
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
# Locality Sensitive Hashing Recommendation System
|
|
2
|
+
|
|
3
|
+
[License](LICENSE)
|
|
4
|
+
[Python](https://www.python.org/downloads/)
|
|
5
|
+
[PyPI: locality-sensitive-hashing-recommendation-system](https://pypi.org/project/locality-sensitive-hashing-recommendation-system/)
|
|
6
|
+
[Lint workflow](https://github.com/mxngjxa/lshrs/actions/workflows/lint.yml)
|
|
7
|
+
[Deployments](https://github.com/mxngjxa/lshrs/deployments)
|
|
8
|
+
[Ruff](https://github.com/astral-sh/ruff)
|
|
9
|
+
[PyPI: lshrs](https://pypi.org/project/lshrs/)
|
|
10
|
+
[Contributors](https://GitHub.com/mxngjxa/lshrs/graphs/contributors/)
|
|
11
|
+
[Commit activity](https://GitHub.com/mxngjxa/lshrs/graphs/commit-activity)
|
|
12
|
+
|
|
13
|
+
A Locality Sensitive Hashing (LSH) based recommendation system for efficient similarity search in Python.
|
|
14
|
+
|
|
15
|
+
## Table of Contents
|
|
16
|
+
|
|
17
|
+
- [Project Structure](#project-structure)
|
|
18
|
+
- [Installation](#installation)
|
|
19
|
+
- [Usage](#usage)
|
|
20
|
+
- [Development](#development)
|
|
21
|
+
- [Architecture](#architecture)
|
|
22
|
+
- [License](#license)
|
|
23
|
+
- [Authors](#authors)
|
|
24
|
+
- [Changelog](#changelog)
|
|
25
|
+
- [Contributing](#contributing)
|
|
26
|
+
|
|
27
|
+
## Project Structure
|
|
28
|
+
|
|
29
|
+
The project is organized as follows:
|
|
30
|
+
|
|
31
|
+
```
|
|
32
|
+
.
|
|
33
|
+
├── CHANGELOG.md
|
|
34
|
+
├── docs
|
|
35
|
+
│ ├── api
|
|
36
|
+
│ ├── architecture.md
|
|
37
|
+
│ ├── examples
|
|
38
|
+
│ ├── index.md
|
|
39
|
+
│ ├── project_structure.txt
|
|
40
|
+
│ └── quickstart.md
|
|
41
|
+
├── examples
|
|
42
|
+
│ ├── advanced_usage.py
|
|
43
|
+
│ └── basic_usage.py
|
|
44
|
+
├── LICENSE
|
|
45
|
+
├── pyproject.toml
|
|
46
|
+
├── README.md
|
|
47
|
+
├── src
|
|
48
|
+
│ └── lshrs
|
|
49
|
+
│ ├── core
|
|
50
|
+
│ ├── encoding
|
|
51
|
+
│ ├── hashing
|
|
52
|
+
│ ├── preprocessing
|
|
53
|
+
│ └── utils
|
|
54
|
+
└── tests
|
|
55
|
+
├── fixtures
|
|
56
|
+
├── integration
|
|
57
|
+
└── unit
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
- **docs**: Contains project documentation.
|
|
61
|
+
- **examples**: Contains example scripts for using the library.
|
|
62
|
+
- **src/lshrs**: Contains the source code for the `lshrs` library.
|
|
63
|
+
- **tests**: Contains unit and integration tests.
|
|
64
|
+
|
|
65
|
+
## Installation
|
|
66
|
+
|
|
67
|
+
This project uses [Poetry](https://python-poetry.org/) for dependency management. Here’s how to set up the development environment from scratch.
|
|
68
|
+
|
|
69
|
+
### 1. Install `pipx` (Recommended)
|
|
70
|
+
|
|
71
|
+
`pipx` is a tool to help you install and run Python applications in isolated environments. It's the recommended way to install `poetry`.
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
python3 -m pip install --user pipx
|
|
75
|
+
python3 -m pipx ensurepath
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
After running this, you may need to restart your terminal for the `pipx` command to be available.
|
|
79
|
+
|
|
80
|
+
### 2. Install Poetry
|
|
81
|
+
|
|
82
|
+
Once `pipx` is installed, you can use it to install Poetry:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
pipx install poetry
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### 3. Set Up the Project
|
|
89
|
+
|
|
90
|
+
Now, you can set up the project itself.
|
|
91
|
+
|
|
92
|
+
1. **Clone the repository:**
|
|
93
|
+
```bash
|
|
94
|
+
git clone https://github.com/mxngjxa/lshrs.git
|
|
95
|
+
cd lshrs
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
2. **macOS Prerequisite: Install `gfortran`**
|
|
99
|
+
If you are on macOS, you will need to install a Fortran compiler for the `scipy` dependency to build correctly. The easiest way is to use [Homebrew](https://brew.sh/):
|
|
100
|
+
```bash
|
|
101
|
+
brew install gfortran
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
3. **Create a local virtual environment.**
|
|
105
|
+
It's recommended to create a virtual environment in the project's root directory.
|
|
106
|
+
```bash
|
|
107
|
+
python -m venv .venv
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
4. **Configure Poetry to use the local virtual environment.**
|
|
111
|
+
This step ensures that Poetry installs dependencies into the `.venv` directory you just created.
|
|
112
|
+
```bash
|
|
113
|
+
poetry config virtualenvs.in-project true
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
5. **Install dependencies.**
|
|
117
|
+
Finally, use Poetry to install the project's dependencies.
|
|
118
|
+
```bash
|
|
119
|
+
poetry install
|
|
120
|
+
```
|
|
121
|
+
This will install all the dependencies defined in the `pyproject.toml` file.
|
|
122
|
+
|
|
123
|
+
## Usage
|
|
124
|
+
|
|
125
|
+
You can find basic and advanced usage examples in the `examples` directory.
|
|
126
|
+
|
|
127
|
+
- [`basic_usage.py`](examples/basic_usage.py)
|
|
128
|
+
- [`advanced_usage.py`](examples/advanced_usage.py)
|
|
129
|
+
|
|
130
|
+
## Development
|
|
131
|
+
|
|
132
|
+
This project uses `ruff` for linting and formatting.
|
|
133
|
+
|
|
134
|
+
- **Linting:** To check for any style issues or errors, run the following command:
|
|
135
|
+
```bash
|
|
136
|
+
poetry run ruff check src
|
|
137
|
+
```
|
|
138
|
+
- **Formatting:** To automatically apply the fixes that `ruff` can make for the issues it finds, run this command:
|
|
139
|
+
```bash
|
|
140
|
+
poetry run ruff check --fix src
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## Architecture
|
|
144
|
+
|
|
145
|
+
The following diagram illustrates the architecture of the LSH recommendation system:
|
|
146
|
+
|
|
147
|
+
```mermaid
|
|
148
|
+
---
|
|
149
|
+
config:
|
|
150
|
+
layout: dagre
|
|
151
|
+
---
|
|
152
|
+
flowchart TD
|
|
153
|
+
|
|
154
|
+
A[("Text Documents")]
|
|
155
|
+
A --> B["**DataLoader**
|
|
156
|
+
- Indexing
|
|
157
|
+
- Full Representation
|
|
158
|
+
- Signature
|
|
159
|
+
- Embeddings"]
|
|
160
|
+
|
|
161
|
+
B --> C["Preprocessing"]
|
|
162
|
+
|
|
163
|
+
subgraph Preprocessing
|
|
164
|
+
direction LR
|
|
165
|
+
C --> D["Tokenize"]
|
|
166
|
+
D --> E["Lemmatize"]
|
|
167
|
+
E --> F["Remove Stopwords"]
|
|
168
|
+
F --> G["Shingling"]
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
C --> H["Vectorization"]
|
|
172
|
+
|
|
173
|
+
subgraph Vectorization
|
|
174
|
+
direction LR
|
|
175
|
+
H --> I["TF-IDF"]
|
|
176
|
+
H --> J["One-Hot Encoding"]
|
|
177
|
+
H --> K["Embeddings"]
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
I --> L["Cosine Similarity"]
|
|
181
|
+
J --> M["Jaccard Similarity"]
|
|
182
|
+
K --> L
|
|
183
|
+
|
|
184
|
+
subgraph Hashing
|
|
185
|
+
direction LR
|
|
186
|
+
L --> N["Hyperplane Hashing"]
|
|
187
|
+
M --> O["MinHash"]
|
|
188
|
+
end
|
|
189
|
+
|
|
190
|
+
N --> P["LSH"]
|
|
191
|
+
O --> P
|
|
192
|
+
|
|
193
|
+
P --> Q["Candidate Pairs"]
|
|
194
|
+
Q --> R["Similarity Calculation"]
|
|
195
|
+
R --> S["Top-N Recommendations"]
|
|
196
|
+
|
|
197
|
+
S --> T["Output"]
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
## Core Orchestration of the `lshrs` Library
|
|
201
|
+
|
|
202
|
+
The `src/lshrs` directory contains the source code for the `lshrs` library, a Python-based recommendation system using Locality Sensitive Hashing (LSH).
|
|
203
|
+
|
|
204
|
+
### Modules
|
|
205
|
+
|
|
206
|
+
The `lshrs` library is organized into the following modules:
|
|
207
|
+
|
|
208
|
+
- [Core](#core)
|
|
209
|
+
- [Encoding](#encoding)
|
|
210
|
+
- [Hashing](#hashing)
|
|
211
|
+
- [Preprocessing](#preprocessing)
|
|
212
|
+
- [Utils](#utils)
|
|
213
|
+
|
|
214
|
+
---
|
|
215
|
+
|
|
216
|
+
#### Core
|
|
217
|
+
|
|
218
|
+
The `core` module contains the main components for running the LSH recommendation system.
|
|
219
|
+
|
|
220
|
+
- **`config.py`**: Defines configuration settings for the application.
|
|
221
|
+
- **`dataloader.py`**: Handles loading and preparing data for the LSH process.
|
|
222
|
+
- **`exceptions.py`**: Defines custom exception classes for error handling.
|
|
223
|
+
- **`interfaces.py`**: Contains interface definitions for different components.
|
|
224
|
+
- **`main.py`**: The main entry point for running the LSH recommendation system.
|
|
225
|
+
|
|
226
|
+
---
|
|
227
|
+
|
|
228
|
+
#### Encoding
|
|
229
|
+
|
|
230
|
+
The `encoding` module provides different methods for vectorizing text data.
|
|
231
|
+
|
|
232
|
+
- **`embedding.py`**: Implements word embedding techniques.
|
|
233
|
+
- **`main.py`**: Main script for handling encoding processes.
|
|
234
|
+
- **`onehot.py`**: Implements one-hot encoding.
|
|
235
|
+
- **`tfidf.py`**: Implements TF-IDF (Term Frequency-Inverse Document Frequency) vectorization.
|
|
236
|
+
|
|
237
|
+
---
|
|
238
|
+
|
|
239
|
+
#### Hashing
|
|
240
|
+
|
|
241
|
+
The `hashing` module contains different hashing algorithms used in LSH.
|
|
242
|
+
|
|
243
|
+
- **`hyperplane.py`**: Implements hyperplane-based hashing for cosine similarity.
|
|
244
|
+
- **`lsh.py`**: The main LSH implementation that combines hashing and candidate selection.
|
|
245
|
+
- **`minhash.py`**: Implements MinHash for Jaccard similarity.
|
|
246
|
+
|
|
247
|
+
---
|
|
248
|
+
|
|
249
|
+
#### Preprocessing
|
|
250
|
+
|
|
251
|
+
The `preprocessing` module provides tools for cleaning and preparing text data.
|
|
252
|
+
|
|
253
|
+
- **`lemmatize.py`**: Implements lemmatization to reduce words to their base form.
|
|
254
|
+
- **`shingling.py`**: Implements shingling to create k-shingles from text.
|
|
255
|
+
- **`stopwords.py`**: Provides functionality for removing stopwords.
|
|
256
|
+
- **`website.py`**: Contains functions for preprocessing website content.
|
|
257
|
+
|
|
258
|
+
---
|
|
259
|
+
|
|
260
|
+
#### Utils
|
|
261
|
+
|
|
262
|
+
The `utils` module contains helper functions and utilities used across the library.
|
|
263
|
+
|
|
264
|
+
- **`br.py`**: Implements the band-and-row (BR) technique for LSH.
|
|
265
|
+
- **`helpers.py`**: Contains general helper functions.
|
|
266
|
+
- **`save.py`**: Provides functionality for saving and loading data.
|
|
267
|
+
- **`similarity.py`**: Contains functions for calculating similarity between vectors.
|
|
268
|
+
|
|
269
|
+
---
|
|
270
|
+
|
|
271
|
+
## License
|
|
272
|
+
|
|
273
|
+
This project is licensed under the MIT License. See the [`LICENSE`](LICENSE) file for details.
|
|
274
|
+
|
|
275
|
+
## Authors
|
|
276
|
+
|
|
277
|
+
- Y. Zhao ([yimingzhao936@gmail.com](mailto:yimingzhao936@gmail.com))
|
|
278
|
+
- M. Guan ([mingjia.guan@outlook.com](mailto:mingjia.guan@outlook.com))
|
|
279
|
+
|
|
280
|
+
## Changelog
|
|
281
|
+
|
|
282
|
+
See the [`CHANGELOG.md`](CHANGELOG.md) file for a history of changes to the project.
|
|
283
|
+
|
|
284
|
+
## Contributing
|
|
285
|
+
|
|
286
|
+
Contributions are welcome! Please see the [Development](#development) section for linting and formatting guidelines.
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "lshrs"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "A Locality Sensitive Hashing based recommendation system for efficient similarity search."
|
|
5
|
+
authors = ["Y. Zhao <yimingzhao936@gmail.com>", "M. Guan <mingjia.guan@outlook.com>"]
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
license = "MIT"
|
|
8
|
+
homepage = "https://github.com/mxngjxa/lshrs"
|
|
9
|
+
repository = "https://github.com/mxngjxa/lshrs"
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Programming Language :: Python :: 3",
|
|
12
|
+
"Operating System :: OS Independent",
|
|
13
|
+
]
|
|
14
|
+
packages = [{include = "lshrs", from = "src"}]
|
|
15
|
+
|
|
16
|
+
[tool.poetry.dependencies]
|
|
17
|
+
python = ">3.11"
|
|
18
|
+
scipy = "*"
|
|
19
|
+
scikit-learn = "*"
|
|
20
|
+
numpy = "*"
|
|
21
|
+
nltk = "*"
|
|
22
|
+
twine = "*"
|
|
23
|
+
build = "*"
|
|
24
|
+
|
|
25
|
+
[tool.poetry.group.dev.dependencies]
|
|
26
|
+
ruff = "*"
|
|
27
|
+
|
|
28
|
+
[tool.ruff]
|
|
29
|
+
line-length = 88
|
|
30
|
+
select = ["E", "F", "W", "I"]
|
|
31
|
+
|
|
32
|
+
[build-system]
|
|
33
|
+
requires = ["poetry-core"]
|
|
34
|
+
build-backend = "poetry.core.masonry.api"
|
|
File without changes
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core module for LSH-based recommendation system.
|
|
3
|
+
Provides configuration, interfaces, and main orchestration components.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from .config import EncodingConfig, LSHConfig, RecommenderConfig
|
|
7
|
+
from .exceptions import (
|
|
8
|
+
ConfigurationError,
|
|
9
|
+
DataProcessingError,
|
|
10
|
+
LSHError,
|
|
11
|
+
RecommenderError,
|
|
12
|
+
)
|
|
13
|
+
from .interfaces import BaseEncoder, BaseHasher, BaseRecommender, BaseSimilarity
|
|
14
|
+
from .main import LSHRecommender, RecommendationPipeline
|
|
15
|
+
|
|
16
|
+
__version__ = "0.0.1"
|
|
17
|
+
__author__ = "Y. Zhao, M. Guan"
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"RecommenderConfig",
|
|
21
|
+
"LSHConfig",
|
|
22
|
+
"EncodingConfig",
|
|
23
|
+
"RecommenderError",
|
|
24
|
+
"ConfigurationError",
|
|
25
|
+
"DataProcessingError",
|
|
26
|
+
"LSHError",
|
|
27
|
+
"BaseEncoder",
|
|
28
|
+
"BaseHasher",
|
|
29
|
+
"BaseRecommender",
|
|
30
|
+
"BaseSimilarity",
|
|
31
|
+
"RecommendationPipeline",
|
|
32
|
+
"LSHRecommender"
|
|
33
|
+
]
|