code2logic 1.0.43__tar.gz → 1.0.45__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. {code2logic-1.0.43 → code2logic-1.0.45}/PKG-INFO +50 -37
  2. {code2logic-1.0.43 → code2logic-1.0.45}/README.md +49 -36
  3. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/__init__.py +1 -1
  4. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/analyzer.py +45 -7
  5. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/cli.py +21 -59
  6. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/similarity.py +32 -6
  7. {code2logic-1.0.43 → code2logic-1.0.45}/pyproject.toml +1 -1
  8. {code2logic-1.0.43 → code2logic-1.0.45}/LICENSE +0 -0
  9. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/__main__.py +0 -0
  10. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/adaptive.py +0 -0
  11. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/base.py +0 -0
  12. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/base_generator.py +0 -0
  13. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/benchmark.py +0 -0
  14. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/benchmarks/__init__.py +0 -0
  15. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/benchmarks/common.py +0 -0
  16. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/benchmarks/results.py +0 -0
  17. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/benchmarks/runner.py +0 -0
  18. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/chunked_reproduction.py +0 -0
  19. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/code_review.py +0 -0
  20. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/config.py +0 -0
  21. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/core/__init__.py +0 -0
  22. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/dependency.py +0 -0
  23. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/errors.py +0 -0
  24. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/file_formats.py +0 -0
  25. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/formats/__init__.py +0 -0
  26. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/function_logic.py +0 -0
  27. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/generators.py +0 -0
  28. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/gherkin.py +0 -0
  29. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/integrations/__init__.py +0 -0
  30. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/intent.py +0 -0
  31. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/llm/__init__.py +0 -0
  32. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/llm.py +0 -0
  33. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/llm_clients.py +0 -0
  34. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/llm_profiler.py +0 -0
  35. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/logicml.py +0 -0
  36. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/markdown_format.py +0 -0
  37. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/mcp_server.py +0 -0
  38. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/metrics.py +0 -0
  39. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/models.py +0 -0
  40. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/parsers.py +0 -0
  41. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/project_comparison.md +0 -0
  42. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/project_reproducer.py +0 -0
  43. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/prompts.py +0 -0
  44. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/py.typed +0 -0
  45. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/quality.py +0 -0
  46. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/refactor.py +0 -0
  47. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/reproducer.py +0 -0
  48. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/reproduction.py +0 -0
  49. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/schemas/__init__.py +0 -0
  50. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/schemas/json_schema.py +0 -0
  51. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/schemas/logicml_schema.py +0 -0
  52. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/schemas/markdown_schema.py +0 -0
  53. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/schemas/yaml_schema.py +0 -0
  54. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/shared_utils.py +0 -0
  55. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/terminal.py +0 -0
  56. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/tools/__init__.py +0 -0
  57. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/toon_format.py +0 -0
  58. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/universal.py +0 -0
  59. {code2logic-1.0.43 → code2logic-1.0.45}/code2logic/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: code2logic
3
- Version: 1.0.43
3
+ Version: 1.0.45
4
4
  Summary: Code2Logic - Source code to logical representation converter for LLM analysis, featuring Tree-sitter parsing, dependency graph analysis, and multi-language support.
5
5
  License: Apache-2.0
6
6
  License-File: LICENSE
@@ -94,10 +94,18 @@ pip install code2logic[nlp] # Enhanced intents
94
94
 
95
95
  ## 📖 Quick Start
96
96
  ```bash
97
- code2logic ./ -f yaml --compact --function-logic --with-schema -o project.yaml
98
- code2logic ./ -f toon --function-logic function.toon --with-schema --name project -o ./
99
- # Optional: include function intent/purpose column in function.toon
100
- code2logic ./ -f toon --function-logic function.toon --does --name project -o ./
97
+ # TOON compact (best token efficiency: 5.9x smaller than JSON)
98
+ code2logic ./ -f toon --compact --name project -o ./
99
+
100
+ # TOON with function-logic + structural context
101
+ code2logic ./ -f toon --compact --no-repeat-module \
102
+ --function-logic function.toon --function-logic-context minimal --name project -o ./
103
+
104
+ # TOON-Hybrid (project structure + function details for hub modules)
105
+ code2logic ./ -f toon --hybrid --no-repeat-module --name project -o ./
106
+
107
+ # YAML compact (human-readable, good compromise)
108
+ code2logic ./ -f yaml --compact --name project -o ./
101
109
  ```
102
110
 
103
111
  ### Command Line
@@ -312,14 +320,20 @@ Similar Functions:
312
320
 
313
321
  ```text
314
322
  code2logic/
315
- ├── analyzer.py # Main orchestrator
316
- ├── parsers.py # Tree-sitter + fallback parser
317
- ├── dependency.py # NetworkX dependency analysis
318
- ├── similarity.py # Rapidfuzz similar detection
319
- ├── intent.py # NLP intent generation
320
- ├── generators.py # Output generators (MD/Compact/JSON)
321
- ├── models.py # Data structures
322
- └── cli.py # Command-line interface
323
+ ├── analyzer.py # Main orchestrator
324
+ ├── parsers.py # Tree-sitter + fallback parser
325
+ ├── dependency.py # NetworkX dependency analysis
326
+ ├── similarity.py # Rapidfuzz similar detection
327
+ ├── intent.py # NLP intent generation
328
+ ├── generators.py # Output generators (MD/Compact/JSON/YAML/CSV)
329
+ ├── toon_format.py # TOON generator (compact, hybrid)
330
+ ├── logicml.py # LogicML generator (typed signatures)
331
+ ├── function_logic.py # Function-logic TOON with structural context
332
+ ├── metrics.py # AST-based quality metrics
333
+ ├── models.py # Data structures
334
+ ├── cli.py # Command-line interface
335
+ ├── benchmarks/ # Benchmark runner, results, common utils
336
+ └── llm_clients.py # Unified LLM client (OpenRouter/Ollama/LiteLLM)
323
337
  ```
324
338
 
325
339
  ## 🔌 Integration Examples
@@ -415,40 +429,39 @@ Compact format is ~10-15x smaller than Markdown.
415
429
 
416
430
  ## 🔬 Code Reproduction Benchmarks
417
431
 
418
- Code2Logic can reproduce code from specifications using LLMs. Benchmark results:
432
+ Benchmark results (20 files, model: `arcee-ai/trinity-large-preview`, 2026-02-25):
419
433
 
420
- ### Format Comparison (Token Efficiency)
434
+ ### Project Benchmark — Format Comparison
421
435
 
422
- | Format | Score | Token Efficiency | Spec Tokens | Runs OK |
423
- | --- | --- | --- | --- | --- |
424
- | **YAML** | **71.1%** | 42.1 | **366** | 66.7% |
425
- | **Markdown** | 65.6% | **48.7** | 385 | **100%** |
426
- | JSON | 61.9% | 23.7 | 605 | 66.7% |
427
- | Gherkin | 51.3% | 19.1 | 411 | 66.7% |
436
+ | Format | Score | Syntax OK | Runs OK | ~Tokens | Efficiency (p/kT) |
437
+ |--------|------:|----------:|--------:|--------:|---------:|
438
+ | **toon** | **63,8%** | 100% | 60% | 17 875 | **3,57** |
439
+ | json | 62,9% | 100% | 60% | 104 914 | 0,60 |
440
+ | markdown | 62,5% | 100% | 55% | 36 851 | 1,70 |
441
+ | yaml | 62,4% | 100% | 55% | 68 651 | 0,91 |
442
+ | logicml | 60,4% | 100% | 55% | ~30 000 | ~2,01 |
443
+ | csv | 53,0% | 100% | 40% | 80 779 | 0,66 |
444
+ | function.toon | 49,3% | 95% | 35% | 29 271 | 1,68 |
445
+ | gherkin | 38,6% | 95% | 30% | ~25 000 | ~1,54 |
446
+
447
+ **Behavioral benchmark:** 85,7% (6/7 functions passed).
428
448
 
429
449
  ### Key Findings
430
450
 
431
- - **YAML is best for score** - 71.1% reproduction accuracy
432
- - **Markdown is best for token efficiency** - 48.7 score/1000 tokens
433
- - **YAML uses 39.6% fewer tokens than JSON** with 9.2% higher score
434
- - **Markdown has 100% runs OK** - generated code always executes
451
+ - **TOON wins on efficiency** — best score (63,8%) at 5,9x fewer tokens than JSON
452
+ - **Syntax OK = 100%** for all major formats — the LLM always generates valid syntax
453
+ - **function.toon paradox** — worse than project.toon despite larger file, due to missing class/module context (fixed in v1.0.43 with `--function-logic-context`)
454
+ - **gherkin/csv** — poor fit for code description; their structure doesn't map to programming constructs
435
455
 
436
456
  ### Run Benchmarks
437
457
 
438
458
  ```bash
439
- # Token-aware benchmark
440
- python examples/11_token_benchmark.py --folder tests/samples/ --no-llm
441
-
442
- # Async multi-format benchmark
443
- python examples/09_async_benchmark.py --folder tests/samples/ --no-llm
444
-
445
- # Function-level reproduction
446
- python examples/10_function_reproduction.py --file tests/samples/sample_functions.py --no-llm
447
-
448
- python examples/15_unified_benchmark.py --folder tests/samples/ --no-llm
459
+ make benchmark # Full benchmark suite (requires OPENROUTER_API_KEY)
449
460
 
450
- # Terminal markdown rendering demo
451
- python examples/16_terminal_demo.py --folder tests/samples/
461
+ # Or individually:
462
+ python examples/15_unified_benchmark.py --type format --folder tests/samples/ --limit 20
463
+ python examples/15_unified_benchmark.py --type project --folder tests/samples/ --limit 20
464
+ python examples/15_unified_benchmark.py --type function --file tests/samples/sample_functions.py
452
465
  ```
453
466
 
454
467
  ## 🤝 Contributing
@@ -48,10 +48,18 @@ pip install code2logic[nlp] # Enhanced intents
48
48
 
49
49
  ## 📖 Quick Start
50
50
  ```bash
51
- code2logic ./ -f yaml --compact --function-logic --with-schema -o project.yaml
52
- code2logic ./ -f toon --function-logic function.toon --with-schema --name project -o ./
53
- # Optional: include function intent/purpose column in function.toon
54
- code2logic ./ -f toon --function-logic function.toon --does --name project -o ./
51
+ # TOON compact (best token efficiency: 5.9x smaller than JSON)
52
+ code2logic ./ -f toon --compact --name project -o ./
53
+
54
+ # TOON with function-logic + structural context
55
+ code2logic ./ -f toon --compact --no-repeat-module \
56
+ --function-logic function.toon --function-logic-context minimal --name project -o ./
57
+
58
+ # TOON-Hybrid (project structure + function details for hub modules)
59
+ code2logic ./ -f toon --hybrid --no-repeat-module --name project -o ./
60
+
61
+ # YAML compact (human-readable, good compromise)
62
+ code2logic ./ -f yaml --compact --name project -o ./
55
63
  ```
56
64
 
57
65
  ### Command Line
@@ -266,14 +274,20 @@ Similar Functions:
266
274
 
267
275
  ```text
268
276
  code2logic/
269
- ├── analyzer.py # Main orchestrator
270
- ├── parsers.py # Tree-sitter + fallback parser
271
- ├── dependency.py # NetworkX dependency analysis
272
- ├── similarity.py # Rapidfuzz similar detection
273
- ├── intent.py # NLP intent generation
274
- ├── generators.py # Output generators (MD/Compact/JSON)
275
- ├── models.py # Data structures
276
- └── cli.py # Command-line interface
277
+ ├── analyzer.py # Main orchestrator
278
+ ├── parsers.py # Tree-sitter + fallback parser
279
+ ├── dependency.py # NetworkX dependency analysis
280
+ ├── similarity.py # Rapidfuzz similar detection
281
+ ├── intent.py # NLP intent generation
282
+ ├── generators.py # Output generators (MD/Compact/JSON/YAML/CSV)
283
+ ├── toon_format.py # TOON generator (compact, hybrid)
284
+ ├── logicml.py # LogicML generator (typed signatures)
285
+ ├── function_logic.py # Function-logic TOON with structural context
286
+ ├── metrics.py # AST-based quality metrics
287
+ ├── models.py # Data structures
288
+ ├── cli.py # Command-line interface
289
+ ├── benchmarks/ # Benchmark runner, results, common utils
290
+ └── llm_clients.py # Unified LLM client (OpenRouter/Ollama/LiteLLM)
277
291
  ```
278
292
 
279
293
  ## 🔌 Integration Examples
@@ -369,40 +383,39 @@ Compact format is ~10-15x smaller than Markdown.
369
383
 
370
384
  ## 🔬 Code Reproduction Benchmarks
371
385
 
372
- Code2Logic can reproduce code from specifications using LLMs. Benchmark results:
386
+ Benchmark results (20 files, model: `arcee-ai/trinity-large-preview`, 2026-02-25):
373
387
 
374
- ### Format Comparison (Token Efficiency)
388
+ ### Project Benchmark — Format Comparison
375
389
 
376
- | Format | Score | Token Efficiency | Spec Tokens | Runs OK |
377
- | --- | --- | --- | --- | --- |
378
- | **YAML** | **71.1%** | 42.1 | **366** | 66.7% |
379
- | **Markdown** | 65.6% | **48.7** | 385 | **100%** |
380
- | JSON | 61.9% | 23.7 | 605 | 66.7% |
381
- | Gherkin | 51.3% | 19.1 | 411 | 66.7% |
390
+ | Format | Score | Syntax OK | Runs OK | ~Tokens | Efficiency (p/kT) |
391
+ |--------|------:|----------:|--------:|--------:|---------:|
392
+ | **toon** | **63,8%** | 100% | 60% | 17 875 | **3,57** |
393
+ | json | 62,9% | 100% | 60% | 104 914 | 0,60 |
394
+ | markdown | 62,5% | 100% | 55% | 36 851 | 1,70 |
395
+ | yaml | 62,4% | 100% | 55% | 68 651 | 0,91 |
396
+ | logicml | 60,4% | 100% | 55% | ~30 000 | ~2,01 |
397
+ | csv | 53,0% | 100% | 40% | 80 779 | 0,66 |
398
+ | function.toon | 49,3% | 95% | 35% | 29 271 | 1,68 |
399
+ | gherkin | 38,6% | 95% | 30% | ~25 000 | ~1,54 |
400
+
401
+ **Behavioral benchmark:** 85,7% (6/7 functions passed).
382
402
 
383
403
  ### Key Findings
384
404
 
385
- - **YAML is best for score** - 71.1% reproduction accuracy
386
- - **Markdown is best for token efficiency** - 48.7 score/1000 tokens
387
- - **YAML uses 39.6% fewer tokens than JSON** with 9.2% higher score
388
- - **Markdown has 100% runs OK** - generated code always executes
405
+ - **TOON wins on efficiency** — best score (63,8%) at 5,9x fewer tokens than JSON
406
+ - **Syntax OK = 100%** for all major formats — the LLM always generates valid syntax
407
+ - **function.toon paradox** — worse than project.toon despite larger file, due to missing class/module context (fixed in v1.0.43 with `--function-logic-context`)
408
+ - **gherkin/csv** — poor fit for code description; their structure doesn't map to programming constructs
389
409
 
390
410
  ### Run Benchmarks
391
411
 
392
412
  ```bash
393
- # Token-aware benchmark
394
- python examples/11_token_benchmark.py --folder tests/samples/ --no-llm
395
-
396
- # Async multi-format benchmark
397
- python examples/09_async_benchmark.py --folder tests/samples/ --no-llm
398
-
399
- # Function-level reproduction
400
- python examples/10_function_reproduction.py --file tests/samples/sample_functions.py --no-llm
401
-
402
- python examples/15_unified_benchmark.py --folder tests/samples/ --no-llm
413
+ make benchmark # Full benchmark suite (requires OPENROUTER_API_KEY)
403
414
 
404
- # Terminal markdown rendering demo
405
- python examples/16_terminal_demo.py --folder tests/samples/
415
+ # Or individually:
416
+ python examples/15_unified_benchmark.py --type format --folder tests/samples/ --limit 20
417
+ python examples/15_unified_benchmark.py --type project --folder tests/samples/ --limit 20
418
+ python examples/15_unified_benchmark.py --type function --file tests/samples/sample_functions.py
406
419
  ```
407
420
 
408
421
  ## 🤝 Contributing
@@ -18,7 +18,7 @@ Example:
18
18
  >>> print(output)
19
19
  """
20
20
 
21
- __version__ = "1.0.43"
21
+ __version__ = "1.0.45"
22
22
  __author__ = "Softreck"
23
23
  __email__ = "info@softreck.dev"
24
24
  __license__ = "MIT"
@@ -4,7 +4,9 @@ Main project analyzer orchestrating all analysis components.
4
4
  Provides the high-level API for analyzing codebases.
5
5
  """
6
6
 
7
+ import logging
7
8
  import sys
9
+ import time
8
10
  from collections import defaultdict
9
11
  from datetime import datetime
10
12
  from pathlib import Path
@@ -16,6 +18,8 @@ from .models import ModuleInfo, ProjectInfo
16
18
  from .parsers import TREE_SITTER_AVAILABLE, TreeSitterParser, UniversalParser
17
19
  from .similarity import RAPIDFUZZ_AVAILABLE, SimilarityDetector
18
20
 
21
+ log = logging.getLogger(__name__)
22
+
19
23
 
20
24
  class ProjectAnalyzer:
21
25
  """
@@ -103,6 +107,7 @@ class ProjectAnalyzer:
103
107
  use_treesitter: bool = True,
104
108
  verbose: bool = False,
105
109
  include_private: bool = False,
110
+ enable_similarity: bool = True,
106
111
  ):
107
112
  """
108
113
  Initialize the project analyzer.
@@ -112,10 +117,12 @@ class ProjectAnalyzer:
112
117
  use_treesitter: Whether to use Tree-sitter for parsing
113
118
  verbose: Whether to print status messages
114
119
  include_private: Whether to include private functions/classes
120
+ enable_similarity: Whether to enable similarity detection
115
121
  """
116
122
  self.root_path = Path(root_path).resolve()
117
123
  self.verbose = verbose
118
124
  self.include_private = include_private
125
+ self.enable_similarity = enable_similarity
119
126
  self.modules: List[ModuleInfo] = []
120
127
  self.languages: Dict[str, int] = defaultdict(int)
121
128
 
@@ -137,10 +144,10 @@ class ProjectAnalyzer:
137
144
  def _print_status(self):
138
145
  """Print library availability status."""
139
146
  parts = []
140
- parts.append("TS" if TREE_SITTER_AVAILABLE else "TS")
141
- parts.append("NX" if NETWORKX_AVAILABLE else "NX")
142
- parts.append("RF" if RAPIDFUZZ_AVAILABLE else "RF")
143
- parts.append("NLP" if (SPACY_AVAILABLE or NLTK_AVAILABLE) else "NLP")
147
+ parts.append("TS" if TREE_SITTER_AVAILABLE else "TS")
148
+ parts.append("NX" if NETWORKX_AVAILABLE else "NX")
149
+ parts.append("RF" if RAPIDFUZZ_AVAILABLE else "RF")
150
+ parts.append("NLP" if (SPACY_AVAILABLE or NLTK_AVAILABLE) else "NLP")
144
151
  print(f"Libs: {' '.join(parts)}", file=sys.stderr)
145
152
 
146
153
  def analyze(self) -> ProjectInfo:
@@ -150,18 +157,49 @@ class ProjectAnalyzer:
150
157
  Returns:
151
158
  ProjectInfo with complete analysis results
152
159
  """
160
+ analyze_start = time.time()
161
+
153
162
  # Scan and parse files
163
+ t0 = time.time()
154
164
  self._scan_files()
165
+ t_scan = time.time() - t0
166
+ if self.verbose:
167
+ log.info(
168
+ "Scan complete: modules=%d languages=%s time=%.2fs",
169
+ len(self.modules),
170
+ dict(self.languages),
171
+ t_scan,
172
+ )
155
173
 
156
174
  # Build dependency graph
175
+ t0 = time.time()
157
176
  dep_graph = self.dep_analyzer.build_graph(self.modules)
158
177
  dep_metrics = self.dep_analyzer.analyze_metrics()
178
+ t_dep = time.time() - t0
179
+ if self.verbose:
180
+ log.info("Dependency analysis complete: nodes=%d time=%.2fs", len(dep_graph or {}), t_dep)
159
181
 
160
182
  # Detect entry points
183
+ t0 = time.time()
161
184
  entrypoints = self._detect_entrypoints()
185
+ t_ep = time.time() - t0
186
+ if self.verbose:
187
+ log.info("Entrypoint detection complete: entrypoints=%d time=%.2fs", len(entrypoints), t_ep)
162
188
 
163
189
  # Find similar functions
164
- similar = self.sim_detector.find_similar_functions(self.modules)
190
+ similar: Dict[str, List[str]] = {}
191
+ if self.enable_similarity:
192
+ t0 = time.time()
193
+ similar = self.sim_detector.find_similar_functions(self.modules)
194
+ t_sim = time.time() - t0
195
+ if self.verbose:
196
+ log.info("Similarity detection complete: matches=%d time=%.2fs", len(similar), t_sim)
197
+ else:
198
+ if self.verbose:
199
+ log.info("Similarity detection skipped (--no-similarity)")
200
+
201
+ if self.verbose:
202
+ log.info("Total analysis time: %.2fs", time.time() - analyze_start)
165
203
 
166
204
  return ProjectInfo(
167
205
  name=self.root_path.name,
@@ -221,14 +259,14 @@ class ProjectAnalyzer:
221
259
  module = self.ts_parser.parse(rel_path, content, language)
222
260
  except Exception as e:
223
261
  if self.verbose:
224
- print(f"Tree-sitter parser failed for {rel_path}: {e}", file=sys.stderr)
262
+ log.debug("Tree-sitter parser failed for %s: %s", rel_path, e)
225
263
 
226
264
  if module is None:
227
265
  try:
228
266
  module = self.fallback_parser.parse(rel_path, content, language)
229
267
  except Exception as e:
230
268
  if self.verbose:
231
- print(f"Fallback parser failed for {rel_path}: {e}", file=sys.stderr)
269
+ log.debug("Fallback parser failed for %s: %s", rel_path, e)
232
270
  continue
233
271
 
234
272
  if module:
@@ -10,6 +10,7 @@ Usage:
10
10
 
11
11
  import argparse
12
12
  import json
13
+ import logging
13
14
  import os
14
15
  import signal
15
16
  import subprocess
@@ -508,24 +509,12 @@ def _code2logic_llm_cli(argv: list[str]) -> None:
508
509
  return
509
510
 
510
511
 
511
- def main():
512
- """Main CLI entry point."""
513
- cli_start = time.time()
514
-
515
- try:
516
- signal.signal(signal.SIGPIPE, signal.SIG_DFL)
517
- except Exception:
518
- pass
519
-
520
- if len(sys.argv) > 1 and sys.argv[1] == 'llm':
521
- _code2logic_llm_cli(sys.argv[2:])
522
- return
523
-
512
+ def main(argv=None):
524
513
  parser = argparse.ArgumentParser(
525
- prog='code2logic',
526
- description='Convert source code to logical representation for LLM analysis',
527
- formatter_class=argparse.RawDescriptionHelpFormatter,
528
- epilog='''
514
+ description='Analyze source code and generate logical representations',
515
+ formatter_class=argparse.RawDescriptionHelpFormatter
516
+ )
517
+ epilog='''
529
518
  Examples:
530
519
  code2logic /path/to/project # Standard Markdown
531
520
  code2logic /path/to/project -f csv # CSV (best for LLM, ~50% smaller)
@@ -551,41 +540,6 @@ Detail levels (columns in csv/json/yaml):
551
540
  standard - + intent, category, domain, imports (8 columns)
552
541
  full - + calls, lines, complexity, hash (16 columns)
553
542
  '''
554
- )
555
-
556
- def _maybe_print_pretty_help() -> bool:
557
- """Print colorized help as markdown when appropriate.
558
-
559
- Returns True if help was printed and the CLI should exit early.
560
- """
561
- force_pretty = os.environ.get("CODE2LOGIC_PRETTY_HELP") == "1" or bool(os.environ.get("FORCE_COLOR"))
562
- if not force_pretty:
563
- if not hasattr(sys.stdout, "isatty") or not sys.stdout.isatty():
564
- return False
565
- try:
566
- from .terminal import render
567
- except Exception:
568
- return False
569
-
570
- help_md = f"""# code2logic
571
-
572
- Convert source code to logical representation for LLM analysis.
573
-
574
- ## Usage
575
-
576
- ```bash
577
- code2logic [path] [options]
578
- ```
579
-
580
- ## Help
581
-
582
- ```text
583
- {parser.format_help().rstrip()}
584
- ```
585
- """
586
- render.markdown(help_md)
587
- return True
588
-
589
543
  parser.add_argument(
590
544
  'path',
591
545
  nargs='?',
@@ -690,6 +644,11 @@ code2logic [path] [options]
690
644
  action='store_true',
691
645
  help='Disable Tree-sitter (use fallback parser)'
692
646
  )
647
+ parser.add_argument(
648
+ '--no-similarity',
649
+ action='store_true',
650
+ help='Disable similarity detection (RapidFuzz) to speed up analysis on large projects'
651
+ )
693
652
  parser.add_argument(
694
653
  '-v', '--verbose',
695
654
  action='store_true',
@@ -732,11 +691,10 @@ code2logic [path] [options]
732
691
  )
733
692
 
734
693
  if len(sys.argv) == 1 or any(a in ("-h", "--help") for a in sys.argv[1:]):
735
- if not _maybe_print_pretty_help():
736
- parser.print_help()
694
+ parser.print_help()
737
695
  return
738
696
 
739
- args = parser.parse_args()
697
+ args = parser.parse_args(argv)
740
698
 
741
699
  if not args.no_install and os.environ.get("CODE2LOGIC_NO_INSTALL") in ("1", "true", "True", "yes", "YES"):
742
700
  args.no_install = True
@@ -750,6 +708,11 @@ code2logic [path] [options]
750
708
  # Initialize logger
751
709
  log = Logger(verbose=args.verbose, debug=args.debug)
752
710
 
711
+ logging.basicConfig(
712
+ level=(logging.DEBUG if args.debug else (logging.INFO if args.verbose else logging.WARNING)),
713
+ format='[%(levelname)s] %(message)s',
714
+ )
715
+
753
716
  if args.verbose and not args.quiet:
754
717
  log.header("CODE2LOGIC")
755
718
  log.detail(f"Version: {__version__}")
@@ -842,9 +805,7 @@ code2logic [path] [options]
842
805
 
843
806
  # Path is required for analysis
844
807
  if args.path is None:
845
- # Keep behavior consistent with --help
846
- if not _maybe_print_pretty_help():
847
- parser.print_help()
808
+ parser.print_help()
848
809
  return
849
810
 
850
811
  # Validate path
@@ -865,7 +826,8 @@ code2logic [path] [options]
865
826
  analyzer = ProjectAnalyzer(
866
827
  args.path,
867
828
  use_treesitter=not args.no_treesitter,
868
- verbose=args.debug
829
+ verbose=args.verbose or args.debug,
830
+ enable_similarity=not args.no_similarity,
869
831
  )
870
832
  project = analyzer.analyze()
871
833
  analyze_time = time.time() - analyze_start
@@ -4,11 +4,15 @@ Similarity detector using Rapidfuzz.
4
4
  Detects similar functions across modules to identify
5
5
  potential duplicates and refactoring opportunities.
6
6
  """
7
-
7
+ import logging
8
+ import time
9
+ from collections import defaultdict
8
10
  from typing import Dict, List
9
11
 
10
12
  from .models import ModuleInfo
11
13
 
14
+ log = logging.getLogger(__name__)
15
+
12
16
  # Optional Rapidfuzz import
13
17
  RAPIDFUZZ_AVAILABLE = False
14
18
  try:
@@ -43,6 +47,8 @@ class SimilarityDetector:
43
47
  threshold: Minimum similarity score (0-100) to consider as similar
44
48
  """
45
49
  self.threshold = threshold
50
+ self.max_functions = 8000
51
+ self.progress_every = 250
46
52
 
47
53
  def find_similar_functions(self, modules: List[ModuleInfo]) -> Dict[str, List[str]]:
48
54
  """
@@ -58,6 +64,8 @@ class SimilarityDetector:
58
64
  if not RAPIDFUZZ_AVAILABLE:
59
65
  return {}
60
66
 
67
+ start = time.time()
68
+
61
69
  # Collect all functions
62
70
  all_funcs: List[dict] = []
63
71
  for m in modules:
@@ -76,15 +84,35 @@ class SimilarityDetector:
76
84
  if len(all_funcs) < 2:
77
85
  return {}
78
86
 
87
+ if len(all_funcs) > self.max_functions:
88
+ log.warning(
89
+ "Skipping similarity detection: too many functions (%d > %d). Use --no-similarity to silence this.",
90
+ len(all_funcs),
91
+ self.max_functions,
92
+ )
93
+ return {}
94
+
79
95
  # Find similar functions
80
96
  similar: Dict[str, List[str]] = {}
81
97
  names = [f['name'] for f in all_funcs]
82
98
 
99
+ name_to_fulls: Dict[str, List[str]] = defaultdict(list)
100
+ for f in all_funcs:
101
+ name_to_fulls[f['name']].append(f['full'])
102
+
83
103
  for i, func in enumerate(all_funcs):
84
104
  # Skip common names that would produce false positives
85
105
  if func['name'] in ('__init__', 'constructor', 'toString', 'valueOf'):
86
106
  continue
87
107
 
108
+ if i > 0 and (i % self.progress_every) == 0:
109
+ log.debug(
110
+ "Similarity progress: %d/%d (%.2fs)",
111
+ i,
112
+ len(all_funcs),
113
+ time.time() - start,
114
+ )
115
+
88
116
  matches = process.extract(
89
117
  func['name'],
90
118
  names[:i] + names[i+1:],
@@ -95,15 +123,13 @@ class SimilarityDetector:
95
123
  sim_list = []
96
124
  for match_name, score, _ in matches:
97
125
  if score >= self.threshold and match_name != func['name']:
98
- # Find full name
99
- for other in all_funcs:
100
- if other['name'] == match_name:
101
- sim_list.append(f"{other['full']} ({score}%)")
102
- break
126
+ for full in name_to_fulls.get(match_name, [])[:3]:
127
+ sim_list.append(f"{full} ({score}%)")
103
128
 
104
129
  if sim_list:
105
130
  similar[func['full']] = sim_list
106
131
 
132
+ log.debug("Similarity finished: funcs=%d matches=%d time=%.2fs", len(all_funcs), len(similar), time.time() - start)
107
133
  return similar
108
134
 
109
135
  def find_duplicate_signatures(self, modules: List[ModuleInfo]) -> Dict[str, List[str]]:
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "code2logic"
7
- version = "1.0.43"
7
+ version = "1.0.45"
8
8
  description = "Code2Logic - Source code to logical representation converter for LLM analysis, featuring Tree-sitter parsing, dependency graph analysis, and multi-language support."
9
9
  readme = "README.md"
10
10
  license = "Apache-2.0"
File without changes