clonehunter 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. clonehunter-1.0.0/.gitignore +266 -0
  2. clonehunter-1.0.0/.python-version +1 -0
  3. clonehunter-1.0.0/LICENSE +21 -0
  4. clonehunter-1.0.0/PKG-INFO +308 -0
  5. clonehunter-1.0.0/README.md +274 -0
  6. clonehunter-1.0.0/assets/clonehunter-report-demo.png +0 -0
  7. clonehunter-1.0.0/examples/clonehunter_diff.json +112 -0
  8. clonehunter-1.0.0/examples/clonehunter_diff_report.html +144 -0
  9. clonehunter-1.0.0/examples/clonehunter_report.html +738 -0
  10. clonehunter-1.0.0/examples/clonehunter_report.json +1085 -0
  11. clonehunter-1.0.0/examples/clonehunter_report.sarif +969 -0
  12. clonehunter-1.0.0/fixtures/demo_monorepo/invoice_form_helpers.ts +64 -0
  13. clonehunter-1.0.0/fixtures/demo_monorepo/invoices_pipeline.py +172 -0
  14. clonehunter-1.0.0/fixtures/demo_monorepo/order_form_helpers.ts +64 -0
  15. clonehunter-1.0.0/fixtures/demo_monorepo/orders_pipeline.py +167 -0
  16. clonehunter-1.0.0/fixtures/tiny_repo/a.py +34 -0
  17. clonehunter-1.0.0/fixtures/tiny_repo/classes.py +6 -0
  18. clonehunter-1.0.0/fixtures/tiny_repo/helpers.py +5 -0
  19. clonehunter-1.0.0/main.py +6 -0
  20. clonehunter-1.0.0/pyproject.toml +74 -0
  21. clonehunter-1.0.0/src/clonehunter/__init__.py +6 -0
  22. clonehunter-1.0.0/src/clonehunter/__main__.py +4 -0
  23. clonehunter-1.0.0/src/clonehunter/_compat/toml.py +20 -0
  24. clonehunter-1.0.0/src/clonehunter/cli/__init__.py +1 -0
  25. clonehunter-1.0.0/src/clonehunter/cli/commands/__init__.py +1 -0
  26. clonehunter-1.0.0/src/clonehunter/cli/commands/diff.py +70 -0
  27. clonehunter-1.0.0/src/clonehunter/cli/commands/scan.py +257 -0
  28. clonehunter-1.0.0/src/clonehunter/cli/main.py +88 -0
  29. clonehunter-1.0.0/src/clonehunter/core/__init__.py +1 -0
  30. clonehunter-1.0.0/src/clonehunter/core/config.py +73 -0
  31. clonehunter-1.0.0/src/clonehunter/core/config_loader.py +101 -0
  32. clonehunter-1.0.0/src/clonehunter/core/errors.py +6 -0
  33. clonehunter-1.0.0/src/clonehunter/core/logging.py +16 -0
  34. clonehunter-1.0.0/src/clonehunter/core/pipeline.py +259 -0
  35. clonehunter-1.0.0/src/clonehunter/core/types.py +93 -0
  36. clonehunter-1.0.0/src/clonehunter/embedding/__init__.py +1 -0
  37. clonehunter-1.0.0/src/clonehunter/embedding/cache.py +37 -0
  38. clonehunter-1.0.0/src/clonehunter/embedding/codebert_embedder.py +88 -0
  39. clonehunter-1.0.0/src/clonehunter/embedding/stub_embedder.py +27 -0
  40. clonehunter-1.0.0/src/clonehunter/engines/__init__.py +8 -0
  41. clonehunter-1.0.0/src/clonehunter/engines/semantic_engine.py +10 -0
  42. clonehunter-1.0.0/src/clonehunter/engines/sonarqube_engine.py +100 -0
  43. clonehunter-1.0.0/src/clonehunter/index/__init__.py +1 -0
  44. clonehunter-1.0.0/src/clonehunter/index/brute_index.py +31 -0
  45. clonehunter-1.0.0/src/clonehunter/index/faiss_index.py +62 -0
  46. clonehunter-1.0.0/src/clonehunter/io/__init__.py +1 -0
  47. clonehunter-1.0.0/src/clonehunter/io/fingerprints.py +14 -0
  48. clonehunter-1.0.0/src/clonehunter/io/fs.py +90 -0
  49. clonehunter-1.0.0/src/clonehunter/io/git.py +18 -0
  50. clonehunter-1.0.0/src/clonehunter/model/__init__.py +1 -0
  51. clonehunter-1.0.0/src/clonehunter/model/interfaces.py +52 -0
  52. clonehunter-1.0.0/src/clonehunter/model/registry.py +25 -0
  53. clonehunter-1.0.0/src/clonehunter/parsing/__init__.py +1 -0
  54. clonehunter-1.0.0/src/clonehunter/parsing/python_ast.py +67 -0
  55. clonehunter-1.0.0/src/clonehunter/parsing/text_units.py +27 -0
  56. clonehunter-1.0.0/src/clonehunter/reporting/__init__.py +1 -0
  57. clonehunter-1.0.0/src/clonehunter/reporting/compare.py +32 -0
  58. clonehunter-1.0.0/src/clonehunter/reporting/html_reporter.py +360 -0
  59. clonehunter-1.0.0/src/clonehunter/reporting/json_reporter.py +88 -0
  60. clonehunter-1.0.0/src/clonehunter/reporting/sarif_reporter.py +59 -0
  61. clonehunter-1.0.0/src/clonehunter/reporting/schema.py +26 -0
  62. clonehunter-1.0.0/src/clonehunter/similarity/__init__.py +1 -0
  63. clonehunter-1.0.0/src/clonehunter/similarity/candidates.py +172 -0
  64. clonehunter-1.0.0/src/clonehunter/similarity/clustering.py +55 -0
  65. clonehunter-1.0.0/src/clonehunter/similarity/lexical.py +19 -0
  66. clonehunter-1.0.0/src/clonehunter/similarity/ranking.py +31 -0
  67. clonehunter-1.0.0/src/clonehunter/similarity/rollup.py +162 -0
  68. clonehunter-1.0.0/src/clonehunter/similarity/scoring.py +9 -0
  69. clonehunter-1.0.0/src/clonehunter/snippets/__init__.py +1 -0
  70. clonehunter-1.0.0/src/clonehunter/snippets/expansion.py +557 -0
  71. clonehunter-1.0.0/src/clonehunter/snippets/generators.py +74 -0
  72. clonehunter-1.0.0/src/clonehunter/snippets/normalization.py +37 -0
  73. clonehunter-1.0.0/tests/conftest.py +13 -0
  74. clonehunter-1.0.0/tests/test_candidates.py +25 -0
  75. clonehunter-1.0.0/tests/test_cli_entrypoints.py +83 -0
  76. clonehunter-1.0.0/tests/test_cli_glob_merge.py +100 -0
  77. clonehunter-1.0.0/tests/test_cli_integration.py +26 -0
  78. clonehunter-1.0.0/tests/test_cli_smoke.py +27 -0
  79. clonehunter-1.0.0/tests/test_clustering.py +33 -0
  80. clonehunter-1.0.0/tests/test_codebert_embedder_integration.py +46 -0
  81. clonehunter-1.0.0/tests/test_config_loader.py +26 -0
  82. clonehunter-1.0.0/tests/test_core_utils.py +17 -0
  83. clonehunter-1.0.0/tests/test_diff_e2e.py +83 -0
  84. clonehunter-1.0.0/tests/test_embedding_cache.py +15 -0
  85. clonehunter-1.0.0/tests/test_error_paths.py +41 -0
  86. clonehunter-1.0.0/tests/test_expansion.py +26 -0
  87. clonehunter-1.0.0/tests/test_faiss_index_mock.py +60 -0
  88. clonehunter-1.0.0/tests/test_git_ops.py +17 -0
  89. clonehunter-1.0.0/tests/test_io_fs.py +54 -0
  90. clonehunter-1.0.0/tests/test_parsing.py +21 -0
  91. clonehunter-1.0.0/tests/test_pipeline_smoke.py +72 -0
  92. clonehunter-1.0.0/tests/test_reporters_html_sarif.py +123 -0
  93. clonehunter-1.0.0/tests/test_reporting_json.py +24 -0
  94. clonehunter-1.0.0/tests/test_rollup.py +177 -0
  95. clonehunter-1.0.0/tests/test_snippets.py +34 -0
  96. clonehunter-1.0.0/tests/test_sonarqube_engine.py +25 -0
  97. clonehunter-1.0.0/tests/test_threshold_edges.py +122 -0
  98. clonehunter-1.0.0/typings/pytest.pyi +23 -0
  99. clonehunter-1.0.0/typings/tomllib.pyi +3 -0
  100. clonehunter-1.0.0/uv.lock +1241 -0
@@ -0,0 +1,266 @@
1
+ # Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos,windows,venv
2
+ # Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode,macos,windows,venv
3
+
4
+ ### macOS ###
5
+ # General
6
+ .DS_Store
7
+ .AppleDouble
8
+ .LSOverride
9
+
10
+ # Icon must end with two \r
11
+ Icon
12
+
13
+
14
+ # Thumbnails
15
+ ._*
16
+
17
+ # Files that might appear in the root of a volume
18
+ .DocumentRevisions-V100
19
+ .fseventsd
20
+ .Spotlight-V100
21
+ .TemporaryItems
22
+ .Trashes
23
+ .VolumeIcon.icns
24
+ .com.apple.timemachine.donotpresent
25
+
26
+ # Directories potentially created on remote AFP share
27
+ .AppleDB
28
+ .AppleDesktop
29
+ Network Trash Folder
30
+ Temporary Items
31
+ .apdisk
32
+
33
+ ### macOS Patch ###
34
+ # iCloud generated files
35
+ *.icloud
36
+
37
+ ### Python ###
38
+ # Byte-compiled / optimized / DLL files
39
+ __pycache__/
40
+ *.py[cod]
41
+ *$py.class
42
+
43
+ # C extensions
44
+ *.so
45
+
46
+ # Distribution / packaging
47
+ .Python
48
+ build/
49
+ develop-eggs/
50
+ dist/
51
+ downloads/
52
+ eggs/
53
+ .eggs/
54
+ lib/
55
+ lib64/
56
+ parts/
57
+ sdist/
58
+ var/
59
+ wheels/
60
+ share/python-wheels/
61
+ *.egg-info/
62
+ .installed.cfg
63
+ *.egg
64
+ MANIFEST
65
+
66
+ # PyInstaller
67
+ # Usually these files are written by a python script from a template
68
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
69
+ *.manifest
70
+ *.spec
71
+
72
+ # Installer logs
73
+ pip-log.txt
74
+ pip-delete-this-directory.txt
75
+
76
+ # Unit test / coverage reports
77
+ htmlcov/
78
+ .tox/
79
+ .nox/
80
+ .coverage
81
+ .coverage.*
82
+ .cache
83
+ nosetests.xml
84
+ coverage.xml
85
+ *.cover
86
+ *.py,cover
87
+ .hypothesis/
88
+ .pytest_cache/
89
+ cover/
90
+
91
+ # Translations
92
+ *.mo
93
+ *.pot
94
+
95
+ # Django stuff:
96
+ *.log
97
+ local_settings.py
98
+ db.sqlite3
99
+ db.sqlite3-journal
100
+
101
+ # Flask stuff:
102
+ instance/
103
+ .webassets-cache
104
+
105
+ # Scrapy stuff:
106
+ .scrapy
107
+
108
+ # Sphinx documentation
109
+ docs/_build/
110
+
111
+ # PyBuilder
112
+ .pybuilder/
113
+ target/
114
+
115
+ # Jupyter Notebook
116
+ .ipynb_checkpoints
117
+
118
+ # IPython
119
+ profile_default/
120
+ ipython_config.py
121
+
122
+ # pyenv
123
+ # For a library or package, you might want to ignore these files since the code is
124
+ # intended to run in multiple environments; otherwise, check them in:
125
+ # .python-version
126
+
127
+ # pipenv
128
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
129
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
130
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
131
+ # install all needed dependencies.
132
+ #Pipfile.lock
133
+
134
+ # poetry
135
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
136
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
137
+ # commonly ignored for libraries.
138
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
139
+ #poetry.lock
140
+
141
+ # pdm
142
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
143
+ #pdm.lock
144
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
145
+ # in version control.
146
+ # https://pdm.fming.dev/#use-with-ide
147
+ .pdm.toml
148
+
149
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
150
+ __pypackages__/
151
+
152
+ # Celery stuff
153
+ celerybeat-schedule
154
+ celerybeat.pid
155
+
156
+ # SageMath parsed files
157
+ *.sage.py
158
+
159
+ # Environments
160
+ .env
161
+ .venv
162
+ env/
163
+ venv/
164
+ ENV/
165
+ env.bak/
166
+ venv.bak/
167
+
168
+ # Spyder project settings
169
+ .spyderproject
170
+ .spyproject
171
+
172
+ # Rope project settings
173
+ .ropeproject
174
+
175
+ # mkdocs documentation
176
+ /site
177
+
178
+ # mypy
179
+ .mypy_cache/
180
+ .dmypy.json
181
+ dmypy.json
182
+
183
+ # Pyre type checker
184
+ .pyre/
185
+
186
+ # pytype static type analyzer
187
+ .pytype/
188
+
189
+ # Cython debug symbols
190
+ cython_debug/
191
+
192
+ # PyCharm
193
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
194
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
195
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
196
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
197
+ #.idea/
198
+
199
+ ### Python Patch ###
200
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
201
+ poetry.toml
202
+
203
+ # ruff
204
+ .ruff_cache/
205
+
206
+ # LSP config files
207
+ pyrightconfig.json
208
+
209
+ ### venv ###
210
+ # Virtualenv
211
+ # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
212
+ [Bb]in
213
+ [Ii]nclude
214
+ [Ll]ib
215
+ [Ll]ib64
216
+ [Ll]ocal
217
+ [Ss]cripts
218
+ pyvenv.cfg
219
+ pip-selfcheck.json
220
+
221
+ ### VisualStudioCode ###
222
+ .vscode/*
223
+ !.vscode/settings.json
224
+ !.vscode/tasks.json
225
+ !.vscode/launch.json
226
+ !.vscode/extensions.json
227
+ !.vscode/*.code-snippets
228
+
229
+ # Local History for Visual Studio Code
230
+ .history/
231
+
232
+ # Built Visual Studio Code Extensions
233
+ *.vsix
234
+
235
+ ### VisualStudioCode Patch ###
236
+ # Ignore all local history of files
237
+ .history
238
+ .ionide
239
+
240
+ ### Windows ###
241
+ # Windows thumbnail cache files
242
+ Thumbs.db
243
+ Thumbs.db:encryptable
244
+ ehthumbs.db
245
+ ehthumbs_vista.db
246
+
247
+ # Dump file
248
+ *.stackdump
249
+
250
+ # Folder config file
251
+ [Dd]esktop.ini
252
+
253
+ # Recycle Bin used on file shares
254
+ $RECYCLE.BIN/
255
+
256
+ # Windows Installer files
257
+ *.cab
258
+ *.msi
259
+ *.msix
260
+ *.msm
261
+ *.msp
262
+
263
+ # Windows shortcuts
264
+ *.lnk
265
+
266
+ # End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos,windows,venv
@@ -0,0 +1 @@
1
+ 3.13
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 David Rogers
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,308 @@
1
+ Metadata-Version: 2.4
2
+ Name: clonehunter
3
+ Version: 1.0.0
4
+ Summary: Find duplicate code in mixed-language repositories using semantic and lexical similarity
5
+ Project-URL: Homepage, https://github.com/drogers0/clonehunter
6
+ Project-URL: Source, https://github.com/drogers0/clonehunter
7
+ Project-URL: Issues, https://github.com/drogers0/clonehunter/issues
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Keywords: code-clone,developer-tools,duplicate-code,refactoring,static-analysis
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Classifier: Topic :: Software Development :: Quality Assurance
21
+ Requires-Python: >=3.10
22
+ Requires-Dist: numpy>=2.0.0
23
+ Requires-Dist: tomli>=2.0.1; python_version < '3.11'
24
+ Requires-Dist: torch>=2.2.0
25
+ Requires-Dist: tqdm>=4.66.0
26
+ Requires-Dist: transformers>=4.40.0
27
+ Provides-Extra: dev
28
+ Requires-Dist: pyright>=1.1.380; extra == 'dev'
29
+ Requires-Dist: pytest>=8.4.0; extra == 'dev'
30
+ Requires-Dist: ruff>=0.15.0; extra == 'dev'
31
+ Provides-Extra: faiss
32
+ Requires-Dist: faiss-cpu>=1.7.4; extra == 'faiss'
33
+ Description-Content-Type: text/markdown
34
+
35
+ # CloneHunter
36
+
37
+ CloneHunter finds duplicate code across mixed-language repositories. It uses function-aware analysis for Python and sliding-window analysis for other code files, and presents the supporting evidence so you can decide what to refactor.
38
+
39
+ Under the hood, CloneHunter combines snippet generation (function/window/call-expansion), transformer-based code embeddings (CodeBERT), vector similarity search (brute-force or FAISS), and lexical scoring before rolling matches up into findings and HTML/JSON/SARIF reports. This is intentionally not a lightweight grep-style checker: it runs a semantic retrieval pipeline with model inference and indexing to catch harder, non-trivial duplicate patterns.
40
+
41
+ ![CloneHunter HTML report screenshot](assets/clonehunter-report-demo.png)
42
+
43
+ ## Quickstart
44
+
45
+ Requires Python 3.10+.
46
+
47
+ ### Install with uv
48
+
49
+ ```bash
50
+ uv python install 3.10
51
+ uv pip install git+https://github.com/drogers0/clonehunter
52
+ ```
53
+
54
+ ### Install from a release tag
55
+
56
+ ```bash
57
+ uv pip install git+https://github.com/drogers0/clonehunter@v1.0.0
58
+ ```
59
+
60
+ ### Install with venv + pip
61
+
62
+ ```bash
63
+ python3.10 -m venv .venv
64
+ source .venv/bin/activate
65
+ pip install --upgrade pip
66
+ pip install git+https://github.com/drogers0/clonehunter
67
+ ```
68
+
69
+ ### From source (dev)
70
+
71
+ ```bash
72
+ git clone https://github.com/drogers0/clonehunter
73
+ cd clonehunter
74
+ uv python install 3.10
75
+ uv sync
76
+ uv pip install -e .
77
+ ```
78
+
79
+ ### Run
80
+
81
+ ```bash
82
+ clonehunter scan . --format html --out clonehunter_report.html
83
+ ```
84
+
85
+ If the CLI entrypoint is not on your PATH, use:
86
+
87
+ ```bash
88
+ uv run clonehunter scan . --format html --out clonehunter_report.html
89
+ ```
90
+
91
+ ### Notes on dependencies
92
+
93
+ * `codebert` embedder requires `torch` and `transformers`.
94
+ * `faiss` index is optional; install `faiss-cpu` for faster search.
95
+ * Use `--embedder stub` for quick local runs without ML dependencies.
96
+
97
+ ---
98
+
99
+ ## Basic Usage
100
+
101
+ Scan a repository (defaults to HTML and `clonehunter_report.html`; output extension follows `--format`):
102
+
103
+ ```bash
104
+ uv run clonehunter scan .
105
+ ```
106
+
107
+ Generate a JSON report:
108
+
109
+ ```bash
110
+ uv run clonehunter scan . --format json --out report.json
111
+ ```
112
+
113
+ Generate an HTML report:
114
+
115
+ ```bash
116
+ uv run clonehunter scan . --format html --out report.html
117
+ ```
118
+
119
+ Generate a SARIF report:
120
+
121
+ ```bash
122
+ uv run clonehunter scan . --format sarif --out report.sarif
123
+ ```
124
+
125
+ Diff scan (compare changed files against the full repo):
126
+
127
+ ```bash
128
+ clonehunter diff --base HEAD --format json --out diff.json
129
+ ```
130
+
131
+ ### Language Support
132
+
133
+ * **Python files**: parsed with AST extraction and analyzed with function/window snippets.
134
+ * **Other code files**: analyzed by file content using window snippets only; this window-only mode is applied automatically.
135
+ * **Cross-file-type comparisons** are allowed.
136
+ * Results can vary by language and repository shape; tune thresholds/windows for best quality.
137
+
138
+ ---
139
+
140
+ ## How Scoring Works
141
+
142
+ Scores are **composite**: embedding similarity blended with lexical similarity.
143
+
144
+ ```
145
+ composite = (1 - lexical_weight) * embedding + lexical_weight * lexical
146
+ ```
147
+
148
+ Matches are filtered by `lexical_min_ratio`, and then the composite score is compared against the relevant threshold (`func`, `win`, or `exp`).
149
+
150
+ ---
151
+
152
+ ## Configuration (pyproject.toml)
153
+
154
+ ```toml
155
+ [tool.clonehunter]
156
+ engine = "semantic"
157
+ cluster_findings = false
158
+ cluster_min_size = 2
159
+
160
+ [tool.clonehunter.thresholds]
161
+ func = 0.92
162
+ win = 0.90
163
+ exp = 0.90
164
+ min_window_hits = 2
165
+ lexical_min_ratio = 0.5
166
+ lexical_weight = 0.3
167
+
168
+ [tool.clonehunter.windows]
169
+ window_lines = 12
170
+ stride_lines = 6
171
+ min_nonempty = 4
172
+
173
+ [tool.clonehunter.expansion]
174
+ enabled = false
175
+ depth = 1
176
+ max_chars = 4000
177
+
178
+ [tool.clonehunter.index]
179
+ name = "brute"
180
+ top_k = 25
181
+
182
+ [tool.clonehunter.cache]
183
+ path = "~/.cache/clonehunter"
184
+
185
+ [tool.clonehunter.embedder]
186
+ name = "codebert"
187
+ model_name = "microsoft/codebert-base"
188
+ revision = "main"
189
+ max_length = 256
190
+ batch_size = 16
191
+ device = "cpu"
192
+ ```
193
+
194
+ By default, CLI scans apply the `monorepo` repotype preset; pass `--repotype <name>` to choose different preset(s), or `--repotype none` to disable presets.
195
+
196
+ ---
197
+
198
+ ## CLI Options (selected)
199
+
200
+ ```
201
+ clonehunter scan [PATHS...] [--format json|html|sarif] [--out FILE]
202
+ --engine semantic|sonarqube
203
+ --embedder codebert|stub
204
+ --index brute|faiss
205
+ --threshold-func FLOAT
206
+ --threshold-win FLOAT
207
+ --threshold-exp FLOAT
208
+ --min-window-hits INT
209
+ --lexical-min-ratio FLOAT
210
+ --lexical-weight FLOAT
211
+ --window-lines INT
212
+ --stride-lines INT
213
+ --min-nonempty INT
214
+ --expand-calls
215
+ --expand-depth INT
216
+ --expand-max-chars INT
217
+ --cache-path PATH
218
+ --cluster
219
+ --cluster-min-size INT
220
+ --repotype dotnet|go|java|kotlin|monorepo|node|none|php|python|react|ruby|rust|swift|cpp
221
+ # repeatable preset globs
222
+ --include-globs GLOB # repeatable; merged with config includes
223
+ --exclude-globs GLOB # repeatable; merged with config excludes
224
+
225
+ clonehunter diff --base REF [--format json|html|sarif] [--out FILE]
226
+ ```
227
+
228
+ `--repotype` is additive and can be mixed (for example `--repotype python --repotype react`).
229
+ If `--repotype` is omitted, CloneHunter defaults to `monorepo`.
230
+ Use `--repotype none` to disable repotype presets.
231
+ Merge order is: `pyproject.toml` globs, then `--repotype`, then explicit `--include-globs/--exclude-globs`.
232
+ When the same glob appears in both include/exclude sets, the most recent CLI layer wins.
233
+
234
+ Example mixed-language scan with custom overrides:
235
+
236
+ ```bash
237
+ uv run clonehunter scan . \
238
+ --repotype python \
239
+ --repotype react \
240
+ --repotype cpp \
241
+ --exclude-globs "**/__generated__/**" \
242
+ --include-globs "**/*.txt" \
243
+ --format html --out report.html
244
+ ```
245
+
246
+ ---
247
+
248
+ ## Example Outputs
249
+
250
+ Example reports live in the `examples/` folder:
251
+
252
+ * [`examples/clonehunter_report.html`](examples/clonehunter_report.html)
253
+ * [`examples/clonehunter_report.json`](examples/clonehunter_report.json)
254
+ * [`examples/clonehunter_report.sarif`](examples/clonehunter_report.sarif)
255
+ * [`examples/clonehunter_diff.json`](examples/clonehunter_diff.json)
256
+ * [`examples/clonehunter_diff_report.html`](examples/clonehunter_diff_report.html)
257
+
258
+ Generate the example reports:
259
+
260
+ ```bash
261
+ uv run clonehunter scan . --format html --out examples/clonehunter_report.html
262
+ uv run clonehunter scan . --format json --out examples/clonehunter_report.json
263
+ uv run clonehunter scan . --format sarif --out examples/clonehunter_report.sarif
264
+ uv run clonehunter diff --base HEAD --format json --out examples/clonehunter_diff.json
265
+ uv run clonehunter diff --base HEAD --format html --out examples/clonehunter_diff_report.html
266
+ ```
267
+
268
+ ---
269
+
270
+ ## Tuning Tips
271
+
272
+ * If you see too many false positives, increase `lexical_min_ratio` and/or `lexical_weight`.
273
+ * Increase `min_window_hits` to require stronger evidence.
274
+ * Exclude tests or generated code via `exclude_globs`.
275
+
276
+ ---
277
+
278
+ ## Limitations
279
+
280
+ * Semantic similarity is approximate, not guaranteed equivalence.
281
+ * Python findings are generally richer due to AST/function context.
282
+ * Non-Python findings use window-only analysis and may require threshold/window tuning.
283
+ * Very small functions are harder to compare meaningfully.
284
+ * Domain-specific logic may require threshold tuning.
285
+
286
+ ---
287
+
288
+ ## Development
289
+
290
+ Run the full test suite:
291
+
292
+ ```bash
293
+ python -m pytest
294
+ ```
295
+
296
+ Run formatting and type checks:
297
+
298
+ ```bash
299
+ ruff format .
300
+ ruff check .
301
+ pyright
302
+ ```
303
+
304
+ Install dev dependencies:
305
+
306
+ ```bash
307
+ uv pip install -e ".[dev,faiss]"
308
+ ```