polydup 0.5.3__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
polydup/__init__.py
ADDED
|
Binary file
|
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: polydup
|
|
3
|
+
Version: 0.5.3
|
|
4
|
+
Classifier: Development Status :: 3 - Alpha
|
|
5
|
+
Classifier: Intended Audience :: Developers
|
|
6
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
7
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
8
|
+
Classifier: Programming Language :: Rust
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Summary: Cross-language duplicate code detector
|
|
16
|
+
Keywords: duplicate,code,detection,rust,tree-sitter
|
|
17
|
+
License: MIT OR Apache-2.0
|
|
18
|
+
Requires-Python: >=3.8
|
|
19
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
20
|
+
|
|
21
|
+
# PolyDup Python Bindings
|
|
22
|
+
|
|
23
|
+
Python bindings for **PolyDup**, a cross-language duplicate code detector powered by Tree-sitter and Rabin-Karp hashing.
|
|
24
|
+
|
|
25
|
+
## Features
|
|
26
|
+
|
|
27
|
+
- **Multi-language support**: Detect duplicates across Rust, Python, and JavaScript/TypeScript
|
|
28
|
+
- **Type-2 clone detection**: Finds structurally similar code (normalized identifiers/literals)
|
|
29
|
+
- **GIL-free scanning**: Releases Python's Global Interpreter Lock during CPU-intensive operations
|
|
30
|
+
- **Parallel processing**: Built on Rayon for multi-core performance
|
|
31
|
+
- **Zero-copy architecture**: Direct FFI to Rust core for minimal overhead
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
### From Source (Development)
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
cd crates/polydup-py
|
|
39
|
+
maturin develop --release
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### From PyPI
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install polydup
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Usage
|
|
49
|
+
|
|
50
|
+
### Basic Example
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
import polydup
|
|
54
|
+
|
|
55
|
+
# Scan a directory for duplicates
|
|
56
|
+
report = polydup.find_duplicates(
|
|
57
|
+
paths=['./src', './lib'],
|
|
58
|
+
min_block_size=50, # Minimum tokens per code block
|
|
59
|
+
threshold=0.85 # 85% similarity threshold
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
print(f"Scanned {report.files_scanned} files")
|
|
63
|
+
print(f"Analyzed {report.functions_analyzed} functions")
|
|
64
|
+
print(f"Found {len(report.duplicates)} duplicates")
|
|
65
|
+
print(f"Took {report.stats.duration_ms}ms")
|
|
66
|
+
|
|
67
|
+
# Iterate through duplicates
|
|
68
|
+
for dup in report.duplicates:
|
|
69
|
+
print(f"\n{dup.file1}:{dup.start_line1} ↔️ {dup.file2}:{dup.start_line2}")
|
|
70
|
+
print(f" Similarity: {dup.similarity * 100:.1f}%")
|
|
71
|
+
print(f" Length: {dup.length} tokens")
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Dictionary Output
|
|
75
|
+
|
|
76
|
+
For JSON serialization or dict-based workflows:
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
import polydup
|
|
80
|
+
import json
|
|
81
|
+
|
|
82
|
+
report_dict = polydup.find_duplicates_dict(
|
|
83
|
+
paths=['./src'],
|
|
84
|
+
min_block_size=30,
|
|
85
|
+
threshold=0.9
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
# Serialize to JSON
|
|
89
|
+
print(json.dumps(report_dict, indent=2))
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Concurrent Execution
|
|
93
|
+
|
|
94
|
+
**Critical**: PolyDup releases the GIL during scanning, so other Python threads keep running and multiple scans can execute in parallel:
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
import polydup
|
|
98
|
+
import concurrent.futures
|
|
99
|
+
|
|
100
|
+
def scan_project(path):
|
|
101
|
+
return polydup.find_duplicates([path])
|
|
102
|
+
|
|
103
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
|
|
104
|
+
# These scans run in parallel thanks to GIL release
|
|
105
|
+
futures = [
|
|
106
|
+
executor.submit(scan_project, './project1'),
|
|
107
|
+
executor.submit(scan_project, './project2'),
|
|
108
|
+
executor.submit(scan_project, './project3'),
|
|
109
|
+
]
|
|
110
|
+
|
|
111
|
+
for future in concurrent.futures.as_completed(futures):
|
|
112
|
+
report = future.result()
|
|
113
|
+
print(f"Found {len(report.duplicates)} duplicates")
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## API Reference
|
|
117
|
+
|
|
118
|
+
### `find_duplicates(paths, min_block_size=50, threshold=0.85)`
|
|
119
|
+
|
|
120
|
+
Scan files for duplicate code and return a `Report` object.
|
|
121
|
+
|
|
122
|
+
**Parameters:**
|
|
123
|
+
- `paths` (list[str]): List of file or directory paths to scan
|
|
124
|
+
- `min_block_size` (int, optional): Minimum code block size in tokens. Default: 50
|
|
125
|
+
- `threshold` (float, optional): Similarity threshold (0.0-1.0). Default: 0.85
|
|
126
|
+
|
|
127
|
+
**Returns:** `Report` object with scan results
|
|
128
|
+
|
|
129
|
+
**Raises:** `RuntimeError` if scanning fails
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
133
|
+
### `find_duplicates_dict(paths, min_block_size=50, threshold=0.85)`
|
|
134
|
+
|
|
135
|
+
Same as `find_duplicates()` but returns a Python dictionary.
|
|
136
|
+
|
|
137
|
+
**Returns:** dict with keys:
|
|
138
|
+
- `files_scanned` (int)
|
|
139
|
+
- `functions_analyzed` (int)
|
|
140
|
+
- `duplicates` (list[dict])
|
|
141
|
+
- `stats` (dict)
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
### `version()`
|
|
146
|
+
|
|
147
|
+
Get the PolyDup library version.
|
|
148
|
+
|
|
149
|
+
**Returns:** str (e.g., "0.5.3")
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
### Class: `Report`
|
|
154
|
+
|
|
155
|
+
**Attributes:**
|
|
156
|
+
- `files_scanned` (int): Number of files processed
|
|
157
|
+
- `functions_analyzed` (int): Number of functions extracted
|
|
158
|
+
- `duplicates` (list[DuplicateMatch]): List of detected duplicates
|
|
159
|
+
- `stats` (ScanStats): Performance metrics
|
|
160
|
+
|
|
161
|
+
**Methods:**
|
|
162
|
+
- `to_dict()`: Convert to Python dictionary
|
|
163
|
+
- `__len__()`: Returns number of duplicates
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
### Class: `DuplicateMatch`
|
|
168
|
+
|
|
169
|
+
**Attributes:**
|
|
170
|
+
- `file1` (str): First file path
|
|
171
|
+
- `file2` (str): Second file path
|
|
172
|
+
- `start_line1` (int): Starting line in first file
|
|
173
|
+
- `start_line2` (int): Starting line in second file
|
|
174
|
+
- `length` (int): Length in tokens
|
|
175
|
+
- `similarity` (float): Similarity score (0.0-1.0)
|
|
176
|
+
- `hash` (str): Rolling hash value (hex string)
|
|
177
|
+
|
|
178
|
+
**Methods:**
|
|
179
|
+
- `to_dict()`: Convert to Python dictionary
|
|
180
|
+
|
|
181
|
+
---
|
|
182
|
+
|
|
183
|
+
### Class: `ScanStats`
|
|
184
|
+
|
|
185
|
+
**Attributes:**
|
|
186
|
+
- `total_lines` (int): Total lines of code processed
|
|
187
|
+
- `total_tokens` (int): Total tokens analyzed
|
|
188
|
+
- `unique_hashes` (int): Number of unique code blocks
|
|
189
|
+
- `duration_ms` (int): Scan duration in milliseconds
|
|
190
|
+
|
|
191
|
+
**Methods:**
|
|
192
|
+
- `to_dict()`: Convert to Python dictionary
|
|
193
|
+
|
|
194
|
+
## Performance
|
|
195
|
+
|
|
196
|
+
PolyDup's Python bindings use `py.allow_threads()` to release the Global Interpreter Lock during scanning. This enables:
|
|
197
|
+
|
|
198
|
+
1. **Concurrent Python execution**: Other Python threads continue running
|
|
199
|
+
2. **True parallelism**: Rust's Rayon uses all CPU cores
|
|
200
|
+
3. **Minimal overhead**: Zero-copy FFI with direct Rust integration
|
|
201
|
+
|
|
202
|
+
### Benchmark Example
|
|
203
|
+
|
|
204
|
+
```python
|
|
205
|
+
import polydup
|
|
206
|
+
import time
|
|
207
|
+
|
|
208
|
+
start = time.time()
|
|
209
|
+
report = polydup.find_duplicates(['./large-project'], min_block_size=30)
|
|
210
|
+
elapsed = time.time() - start
|
|
211
|
+
|
|
212
|
+
print(f"Scanned {report.files_scanned} files in {elapsed:.2f}s")
|
|
213
|
+
print(f"Found {len(report.duplicates)} duplicates")
|
|
214
|
+
print(f"Throughput: {report.stats.total_tokens / elapsed:.0f} tokens/sec")
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
## Algorithm
|
|
218
|
+
|
|
219
|
+
PolyDup uses:
|
|
220
|
+
- **Tree-sitter** for language-agnostic AST parsing
|
|
221
|
+
- **Token normalization** for Type-2 clone detection (e.g., `userId` → `$$ID`)
|
|
222
|
+
- **Rabin-Karp rolling hash** with window size 50 for efficient similarity detection
|
|
223
|
+
- **Rayon** for parallel processing across CPU cores
|
|
224
|
+
|
|
225
|
+
See [architecture-research.md](https://github.com/wiesnerbernard/polydup/blob/HEAD/docs/architecture-research.md) for detailed algorithm analysis.
|
|
226
|
+
|
|
227
|
+
## Development
|
|
228
|
+
|
|
229
|
+
### Build
|
|
230
|
+
|
|
231
|
+
```bash
|
|
232
|
+
cd crates/polydup-py
|
|
233
|
+
maturin develop # Debug build
|
|
234
|
+
maturin develop --release # Optimized build
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
### Test
|
|
238
|
+
|
|
239
|
+
```bash
|
|
240
|
+
python test.py
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
### Type Checking
|
|
244
|
+
|
|
245
|
+
```bash
|
|
246
|
+
pip install mypy
|
|
247
|
+
mypy test.py
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
## License
|
|
251
|
+
|
|
252
|
+
MIT OR Apache-2.0
|
|
253
|
+
|
|
254
|
+
## Links
|
|
255
|
+
|
|
256
|
+
- **GitHub**: https://github.com/wiesnerbernard/polydup
|
|
257
|
+
- **Core Library**: [polydup-core](https://github.com/wiesnerbernard/polydup/tree/HEAD/crates/polydup-core)
|
|
258
|
+
- **CLI Tool**: [polydup-cli](https://github.com/wiesnerbernard/polydup/tree/HEAD/crates/polydup-cli)
|
|
259
|
+
- **Node.js Bindings**: [polydup-node](https://github.com/wiesnerbernard/polydup/tree/HEAD/crates/polydup-node)
|
|
260
|
+
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
polydup-0.5.3.dist-info/METADATA,sha256=JHWDoNj7s9ovxyLZQIgp0HT73KfGHJfKCr4-euz-RL0,7053
|
|
2
|
+
polydup-0.5.3.dist-info/WHEEL,sha256=diOBRAksuY94NOBHQsE7IIvZ4t9p1N5ocYOeLLhP3Ts,97
|
|
3
|
+
polydup/__init__.py,sha256=rTDn76C-DO-8IA_tPh9SE9qL1W61mz9oL2ik6Yurz9k,111
|
|
4
|
+
polydup/polydup.cp310-win_amd64.pyd,sha256=unyMscYuB4b1PN5XvcIjkE6H8TgeSQhnj5aOxIBr9L4,4631040
|
|
5
|
+
polydup-0.5.3.dist-info/RECORD,,
|