code2llm 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. code2flow/__init__.py +47 -0
  2. code2flow/__main__.py +6 -0
  3. code2flow/analysis/__init__.py +23 -0
  4. code2flow/analysis/call_graph.py +210 -0
  5. code2flow/analysis/cfg.py +293 -0
  6. code2flow/analysis/coupling.py +77 -0
  7. code2flow/analysis/data_analysis.py +249 -0
  8. code2flow/analysis/dfg.py +224 -0
  9. code2flow/analysis/pipeline_detector.py +445 -0
  10. code2flow/analysis/side_effects.py +313 -0
  11. code2flow/analysis/smells.py +192 -0
  12. code2flow/analysis/type_inference.py +306 -0
  13. code2flow/cli.py +493 -0
  14. code2flow/core/__init__.py +36 -0
  15. code2flow/core/analyzer.py +765 -0
  16. code2flow/core/config.py +177 -0
  17. code2flow/core/models.py +194 -0
  18. code2flow/core/streaming_analyzer.py +666 -0
  19. code2flow/exporters/__init__.py +35 -0
  20. code2flow/exporters/base.py +13 -0
  21. code2flow/exporters/context_exporter.py +207 -0
  22. code2flow/exporters/flow_exporter.py +570 -0
  23. code2flow/exporters/json_exporter.py +17 -0
  24. code2flow/exporters/llm_exporter.py +12 -0
  25. code2flow/exporters/map_exporter.py +218 -0
  26. code2flow/exporters/mermaid_exporter.py +67 -0
  27. code2flow/exporters/toon.py +982 -0
  28. code2flow/exporters/yaml_exporter.py +108 -0
  29. code2flow/llm_flow_generator.py +451 -0
  30. code2flow/llm_task_generator.py +263 -0
  31. code2flow/mermaid_generator.py +481 -0
  32. code2flow/nlp/__init__.py +23 -0
  33. code2flow/nlp/config.py +174 -0
  34. code2flow/nlp/entity_resolution.py +326 -0
  35. code2flow/nlp/intent_matching.py +297 -0
  36. code2flow/nlp/normalization.py +122 -0
  37. code2flow/nlp/pipeline.py +388 -0
  38. code2flow/patterns/__init__.py +0 -0
  39. code2flow/patterns/detector.py +168 -0
  40. code2flow/refactor/__init__.py +0 -0
  41. code2flow/refactor/prompt_engine.py +150 -0
  42. code2flow/visualizers/__init__.py +0 -0
  43. code2flow/visualizers/graph.py +196 -0
  44. code2llm-0.3.7.dist-info/METADATA +604 -0
  45. code2llm-0.3.7.dist-info/RECORD +49 -0
  46. code2llm-0.3.7.dist-info/WHEEL +5 -0
  47. code2llm-0.3.7.dist-info/entry_points.txt +2 -0
  48. code2llm-0.3.7.dist-info/licenses/LICENSE +201 -0
  49. code2llm-0.3.7.dist-info/top_level.txt +1 -0
@@ -0,0 +1,481 @@
1
+ """
2
+ Mermaid PNG Generator for code2flow
3
+ Integrates with CLI to auto-generate PNG from Mermaid files.
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import subprocess
9
+ import tempfile
10
+ import json
11
+ from pathlib import Path
12
+ from typing import List, Optional
13
+
14
+
15
def validate_mermaid_file(mmd_path: Path) -> List[str]:
    """Validate a Mermaid diagram file and return a list of error strings.

    Checks performed:
      * the file exists and is readable
      * a ``graph``/``flowchart`` declaration is present
      * structural brackets/parentheses balance (label segments such as
        ``-->|...|`` and ``["..."]`` are ignored, since labels may legally
        contain syntax characters)
      * node IDs follow the generator's naming scheme (``N1`` / ``F123_name``)

    An empty list means the file looks renderable.
    """
    import re  # single import; the original re-imported re in three places

    errors = []

    if not mmd_path.exists():
        return [f"File not found: {mmd_path}"]

    try:
        content = mmd_path.read_text(encoding='utf-8')

        # Basic syntax checks
        lines = content.strip().split('\n')

        # Check for proper graph declaration
        if not lines or not any(line.strip().startswith(('graph', 'flowchart')) for line in lines):
            errors.append("Missing graph declaration (should start with 'graph' or 'flowchart')")

        def strip_label_segments(s: str) -> str:
            """Remove label segments that frequently contain Mermaid syntax chars.

            We ignore bracket/paren balancing inside:
            - edge labels: -->|...|
            - node labels: N1["..."] or N1[/'...'/] etc.
            """
            # Remove edge labels |...|
            s = re.sub(r"\|[^|]*\|", "||", s)

            # Remove common node label forms: ["..."], ("..."), {"..."}
            s = re.sub(r"\[\"[^\"]*\"\]", "[]", s)
            s = re.sub(r"\(\"[^\"]*\"\)", "()", s)
            s = re.sub(r"\{\"[^\"]*\"\}", "{}", s)

            # Remove Mermaid special bracket label variants like [/'...'/]
            s = re.sub(r"\[/[^\]]*?/\]", "[]", s)
            s = re.sub(r"\(/[^)]*?/\)", "()", s)

            return s

        # Check for unmatched brackets/parentheses (outside label segments)
        bracket_stack = []
        paren_stack = []

        for line_num, line in enumerate(lines, 1):
            line = line.strip()
            if not line or line.startswith('%%'):
                continue  # blank line or Mermaid comment

            # Skip validation for lines that are clearly node definitions with
            # content: ID[content] / ID(content) / ID{content}. Their inner
            # text may contain arbitrary bracket characters, so balancing
            # them here would produce false positives.
            if (('[' in line and ']' in line) or
                    ('(' in line and ')' in line) or
                    ('{' in line and '}' in line)):
                continue

            # Count brackets and parentheses (ignoring those inside label segments)
            check_line = strip_label_segments(line)
            for char in check_line:
                if char == '[':
                    bracket_stack.append((']', line_num))
                elif char == ']':
                    # Only ']' entries are ever pushed, so a non-empty stack
                    # always matches (the original's extra top-of-stack check
                    # was dead code).
                    if not bracket_stack:
                        errors.append(f"Line {line_num}: Unmatched ']'")
                    else:
                        bracket_stack.pop()
                elif char == '(':
                    paren_stack.append((')', line_num))
                elif char == ')':
                    if not paren_stack:
                        errors.append(f"Line {line_num}: Unmatched ')'")
                    else:
                        paren_stack.pop()

        # Report unclosed brackets (only for structural ones, not node content)
        for expected, line_num in bracket_stack:
            errors.append(f"Line {line_num}: Unclosed '[' (missing '{expected}')")
        for expected, line_num in paren_stack:
            errors.append(f"Line {line_num}: Unclosed '(' (missing '{expected}')")

        # Check for invalid node IDs: the generator emits 'N<digits>' or
        # 'F<digits>_<name>' identifiers.
        node_pattern = re.compile(r'^\s*([A-Z]\d+|[Ff]\d+_\w+)\s*["\'\[\{]')

        for line_num, line in enumerate(lines, 1):
            line = line.strip()
            if not line or line.startswith('%%'):
                continue

            # Skip subgraph lines for node ID validation
            if line.startswith('subgraph ') or line == 'end':
                continue

            # Lines with a complete bracket pair are well-formed node
            # definitions; only lines with a dangling opener fall through
            # to the ID check below.
            if (('[' in line and ']' in line) or
                    ('(' in line and ')' in line) or
                    ('{' in line and '}' in line)):
                continue

            # Check node definitions
            if any(char in line for char in ['[', '(', '{']):
                if not node_pattern.match(line):
                    # Try to extract node ID
                    match = re.match(r'^\s*([A-Za-z0-9_]+)', line)
                    if match:
                        node_id = match.group(1)
                        if not re.match(r'^[A-Z]\d+$|^[Ff]\d+_\w+$', node_id):
                            errors.append(f"Line {line_num}: Invalid node ID '{node_id}' (should be like 'N1' or 'F123_name')")

    except Exception as e:
        errors.append(f"Error reading file: {e}")

    return errors
133
+
134
+
135
def fix_mermaid_file(mmd_path: Path) -> bool:
    """Attempt to fix common Mermaid syntax errors in-place.

    Repairs applied line by line:
      * unbalanced parentheses inside edge labels
      * stray trailing ``|`` after node IDs on edge lines
      * Mermaid syntax characters inside edge labels (escaped as HTML entities)
      * node IDs on plain ``A --> B`` edges that contain label-start characters
      * ``subgraph`` IDs containing ``.``, ``-`` or ``:``
      * overly long ``class`` statements (split into chunks of 10 nodes)

    Returns True if the file content changed, False otherwise (including
    when reading/writing fails — the error is printed, not raised).
    """
    try:
        content = mmd_path.read_text(encoding='utf-8')
        lines = content.split('\n')
        fixed_lines = []

        import re

        def sanitize_label_text(txt: str) -> str:
            # Mermaid labels frequently break parsing when they contain Mermaid
            # syntax chars. Replace them with HTML entities (Mermaid renders the
            # entities back as literal characters). The original shipped with
            # no-op replace(c, c) calls, so the escaping never happened.
            # '&' is escaped first, but only when it does not already start an
            # entity — this makes a second pass over the same text a no-op.
            txt = re.sub(r"&(?!#\d+;|[a-zA-Z]+;)", "&amp;", txt)
            return (
                txt.replace('"', '&quot;')
                .replace('[', '&#91;')
                .replace(']', '&#93;')
                .replace('(', '&#40;')
                .replace(')', '&#41;')
                .replace('{', '&#123;')
                .replace('}', '&#125;')
                .replace('|', '&#124;')
            )

        def sanitize_node_id(node_id: str) -> str:
            """Make a Mermaid-safe node identifier.

            Mermaid node IDs should avoid characters like '[', ']', '(', ')',
            '{', '}', '"', '|'. For call-graph exports, we only need
            stable-ish identifiers.
            """
            node_id = (node_id or '').strip()
            # Cut off at first clearly dangerous Mermaid syntax char.
            node_id = re.split(r"[\[\]\(\)\{\}\"\|\s]", node_id, maxsplit=1)[0]
            # Replace remaining non-word chars just in case.
            node_id = re.sub(r"[^A-Za-z0-9_]", "_", node_id)
            return node_id or "N"

        for line in lines:
            # 2. Fix edge labels that might have pipe issues
            if '-->' in line and '|' in line:
                # Handle edge labels like: N1 -->|"label"| N2
                if '-->|' in line:
                    parts = line.split('-->|', 1)
                    if len(parts) == 2:
                        label_and_target = parts[1]
                        # Find the closing |
                        if '|' in label_and_target:
                            parts2 = label_and_target.split('|', 1)
                            if len(parts2) == 2:
                                label_content, target = parts2
                                # Clean up the label content - remove extra pipes if any
                                label_content = label_content.strip('|')
                                # Fix incomplete parentheses in edge labels
                                if label_content.endswith('('):
                                    label_content = label_content[:-1]  # Remove trailing parenthesis
                                elif label_content.count('(') > label_content.count(')'):
                                    # Add missing closing parentheses
                                    missing_parens = label_content.count('(') - label_content.count(')')
                                    label_content += ')' * missing_parens
                                line = f"{parts[0]}-->|{label_content}|{target}"

            # 2b. Fix stray trailing '|' after node IDs (common breakage: N123|)
            # Only apply to edge lines to avoid touching other Mermaid constructs.
            if '-->' in line:
                line = re.sub(r"(\b[A-Za-z]\w*)\|\s*$", r"\1", line)

            # 2c. Sanitize edge label content inside |...|
            # Example bad line: N1 -->|"char == '('"| N2
            def _sanitize_edge_label(m: re.Match) -> str:
                inner = m.group(1)
                return f"|{sanitize_label_text(inner)}|"

            if '-->' in line and '|' in line:
                line = re.sub(r"\|([^|]{1,200})\|", _sanitize_edge_label, line)

            # 2d. Sanitize edge endpoints for simple call-graph lines: A --> B
            # This fixes cases where a node ID contains '[' which Mermaid treats as a label start.
            if '-->' in line and '|' not in line:
                m = re.match(r"^(\s*)([^\s-]+)\s*-->\s*([^\s]+)\s*$", line)
                if m:
                    indent, src, dst = m.groups()
                    src_id = sanitize_node_id(src)
                    dst_id = sanitize_node_id(dst)
                    line = f"{indent}{src_id} --> {dst_id}"

            # 3. Fix malformed subgraph IDs
            if line.strip().startswith('subgraph '):
                subgraph_part = line.strip()[9:].split('(', 1)
                if len(subgraph_part) == 2:
                    subgraph_id, rest = subgraph_part
                    # Clean subgraph ID
                    subgraph_id = subgraph_id.replace('.', '_').replace('-', '_').replace(':', '_')
                    line = f" subgraph {subgraph_id}({rest}"

            # 5. Fix class definitions with too many nodes
            if line.strip().startswith('class ') and ',' in line:
                # Split long class lines
                class_parts = line.split(' ', 1)
                if len(class_parts) == 2:
                    nodes_and_class = class_parts[1]
                    nodes, class_name = nodes_and_class.rsplit(' ', 1)
                    node_list = nodes.split(',')
                    if len(node_list) > 10:  # Split if too many nodes
                        # Create multiple lines
                        for i in range(0, len(node_list), 10):
                            chunk = ','.join(node_list[i:i + 10])
                            fixed_lines.append(f" class {chunk} {class_name}")
                        continue

            fixed_lines.append(line)

        # Write back only if something actually changed
        fixed_content = '\n'.join(fixed_lines)
        if fixed_content != content:
            mmd_path.write_text(fixed_content, encoding='utf-8')
            return True

    except Exception as e:
        print(f"Error fixing {mmd_path}: {e}")

    return False
258
+
259
+
260
def generate_pngs(input_dir: Path, output_dir: Path, timeout: int = 60) -> int:
    """Render every ``.mmd`` file found in *input_dir* to a PNG in *output_dir*.

    Each file is validated first; files with problems get one automatic
    repair attempt and are skipped if issues remain afterwards.

    Returns the number of PNGs rendered successfully (0 when *input_dir*
    contains no ``.mmd`` files).
    """
    rendered = 0

    for source in input_dir.glob('*.mmd'):
        # Validate, giving the auto-fixer a single chance to repair the file.
        issues = validate_mermaid_file(source)
        if issues:
            print(f" Fixing {source.name}: {len(issues)} issues")
            fix_mermaid_file(source)
            issues = validate_mermaid_file(source)

        if issues:
            print(f" Still has errors: {issues[:3]}")  # Show first 3 errors
            continue

        # Render and count only successful conversions.
        target = output_dir / f"{source.stem}.png"
        if generate_single_png(source, target, timeout):
            rendered += 1

    return rendered
289
+
290
+
291
def generate_single_png(mmd_file: Path, output_file: Path, timeout: int = 60) -> bool:
    """Generate PNG from single Mermaid file using available renderers.

    Renderers are tried in order of preference: the `mmdc` CLI, then
    `npx @mermaid-js/mermaid-cli`, then a Puppeteer-based HTML screenshot
    fallback (see generate_with_puppeteer). Returns True on the first
    successful render, False if every renderer fails.
    """

    # Create output directory
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Mermaid's default maxTextSize is often too low for large projects,
    # resulting in placeholder PNGs that say "Maximum text size in diagram exceeded".
    # Provide a temporary config with a higher limit.
    try:
        max_text_size = int(os.getenv('CODE2FLOW_MERMAID_MAX_TEXT_SIZE', '2000000'))
    except Exception:
        # Unparseable env value: fall back to the documented default.
        max_text_size = 2000000

    try:
        max_edges = int(os.getenv('CODE2FLOW_MERMAID_MAX_EDGES', '20000'))
    except Exception:
        max_edges = 20000

    cfg_path: Optional[str] = None
    try:
        cfg = {
            "maxTextSize": max_text_size,
            "maxEdges": max_edges,
            "theme": "default",
        }
        # delete=False: the path must outlive this block so the renderer
        # subprocess can read it; it is removed in the finally clause below.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as tmp_cfg:
            tmp_cfg.write(json.dumps(cfg))
            cfg_path = tmp_cfg.name

        # Try different renderers in order of preference
        renderers = [
            (
                'mmdc',
                [
                    'mmdc',
                    '-i',
                    str(mmd_file),
                    '-o',
                    str(output_file),
                    '-t',
                    'default',
                    '-b',
                    'white',
                    '-c',
                    cfg_path,
                ],
            ),
            (
                'npx',
                [
                    'npx',
                    '-y',
                    '@mermaid-js/mermaid-cli',
                    '-i',
                    str(mmd_file),
                    '-o',
                    str(output_file),
                    '-t',
                    'default',
                    '-b',
                    'white',
                    '-c',
                    cfg_path,
                ],
            ),
            ('puppeteer', None),  # Special handling
        ]
    except Exception:
        # If creating config fails for any reason, fall back to renderer defaults.
        renderers = [
            ('mmdc', ['mmdc', '-i', str(mmd_file), '-o', str(output_file), '-t', 'default', '-b', 'white']),
            ('npx', ['npx', '-y', '@mermaid-js/mermaid-cli', '-i', str(mmd_file), '-o', str(output_file)]),
            ('puppeteer', None),
        ]

    try:
        for renderer_name, cmd in renderers:
            try:
                if renderer_name == 'puppeteer':
                    # Special puppeteer handling: no subprocess command list,
                    # delegate to the HTML-screenshot helper instead.
                    if generate_with_puppeteer(
                        mmd_file,
                        output_file,
                        timeout,
                        max_text_size=max_text_size,
                        max_edges=max_edges,
                    ):
                        return True
                    continue

                # Run command (list form, shell=False: no shell injection risk)
                result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)

                if result.returncode == 0:
                    return True
                else:
                    print(f" {renderer_name} failed: {result.stderr.strip()}")

            # A failing renderer is not fatal — report it and try the next one.
            except subprocess.TimeoutExpired:
                print(f" {renderer_name} timed out")
            except FileNotFoundError:
                print(f" {renderer_name} not available")
            except Exception as e:
                print(f" {renderer_name} error: {e}")

        return False
    finally:
        # Best-effort removal of the temporary Mermaid config file.
        if cfg_path:
            try:
                os.unlink(cfg_path)
            except Exception:
                pass
404
+
405
+
406
def generate_with_puppeteer(
    mmd_file: Path,
    output_file: Path,
    timeout: int = 60,
    max_text_size: int = 2000000,
    max_edges: int = 20000,
) -> bool:
    """Generate PNG using Puppeteer with HTML template.

    Embeds the Mermaid source in a self-contained HTML page (Mermaid is
    loaded from the jsdelivr CDN, so this path needs network access) and
    screenshots it. Returns True iff the screenshot command exits 0.
    """
    try:
        mmd_content = mmd_file.read_text(encoding='utf-8')

        # Doubled braces ({{ }}) are literal braces in this f-string.
        html_template = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<script src="https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.min.js"></script>
<style>
body {{ margin: 0; padding: 20px; background: white; font-family: Arial, sans-serif; }}
.mermaid {{ max-width: none; }}
</style>
</head>
<body>
<div class="mermaid">
{mmd_content}
</div>
<script>
mermaid.initialize({{ startOnLoad: true, theme: 'default', maxTextSize: {max_text_size}, maxEdges: {max_edges} }});
</script>
</body>
</html>
"""

        # Create temporary HTML
        # delete=False so the browser subprocess can open the file after this
        # handle is closed; removed in the inner finally.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as tmp_html:
            tmp_html.write(html_template)
            tmp_html_path = tmp_html.name

        try:
            # Use puppeteer screenshot
            # NOTE(review): assumes the `puppeteer` npm package exposes a
            # `screenshot` subcommand with these flags via npx — confirm
            # against the installed CLI version.
            cmd = [
                'npx', '-y', 'puppeteer',
                'screenshot',
                '--url', f'file://{tmp_html_path}',
                '--output', str(output_file),
                '--wait-for', '.mermaid',
                '--full-page'
            ]

            result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)

            return result.returncode == 0

        finally:
            os.unlink(tmp_html_path)

    except Exception as e:
        # Any failure (read error, subprocess timeout, unlink error) is
        # reported and treated as "renderer unavailable".
        print(f" Puppeteer error: {e}")
        return False
465
+
466
+
467
if __name__ == '__main__':
    # Minimal CLI wrapper for exercising the PNG pipeline by hand.
    import argparse

    cli = argparse.ArgumentParser(description='Generate PNG from Mermaid files')
    cli.add_argument('input_dir', help='Directory with .mmd files')
    cli.add_argument('output_dir', help='Output directory for PNG files')
    opts = cli.parse_args()

    total = generate_pngs(Path(opts.input_dir), Path(opts.output_dir))
    print(f"Generated {total} PNG files")
@@ -0,0 +1,23 @@
1
+ """NLP Processing Pipeline for code2flow.
2
+
3
+ Provides query normalization, intent matching, and entity resolution
4
+ with multilingual support and fuzzy matching.
5
+ """
6
+
7
+ __version__ = "0.2.5"
8
+
9
+ from .pipeline import NLPPipeline
10
+ from .normalization import QueryNormalizer
11
+ from .intent_matching import IntentMatcher
12
+ from .entity_resolution import EntityResolver
13
+ from .config import NLPConfig, FAST_NLP_CONFIG, PRECISE_NLP_CONFIG
14
+
15
+ __all__ = [
16
+ "NLPPipeline",
17
+ "QueryNormalizer",
18
+ "IntentMatcher",
19
+ "EntityResolver",
20
+ "NLPConfig",
21
+ "FAST_NLP_CONFIG",
22
+ "PRECISE_NLP_CONFIG",
23
+ ]
@@ -0,0 +1,174 @@
1
+ """NLP Configuration - YAML-driven settings for NLP pipeline."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import List, Dict, Optional
5
+ from pathlib import Path
6
+ import yaml
7
+
8
+
9
@dataclass
class NormalizationConfig:
    """Configuration for query normalization (pipeline stage 1)."""
    # 1a: Lowercase conversion
    lowercase: bool = True
    # 1b: Punctuation removal
    remove_punctuation: bool = True
    # 1c: Whitespace normalization
    normalize_whitespace: bool = True
    # 1d: Unicode normalization (NFKC)
    unicode_normalize: bool = True
    # 1e: Stopword removal (off by default; see `stopwords` for the word lists)
    remove_stopwords: bool = False
    # Language-specific stopwords, keyed by language code (English / Polish).
    stopwords: Dict[str, List[str]] = field(default_factory=lambda: {
        "en": ["the", "a", "an", "is", "are", "was", "were"],
        "pl": ["w", "z", "do", "na", "jest", "są"],
    })
27
+
28
+
29
@dataclass
class IntentMatchingConfig:
    """Configuration for intent matching (pipeline stage 2)."""
    # 2a: Fuzzy matching threshold (0.0-1.0); higher means stricter matches
    fuzzy_threshold: float = 0.8
    # 2b: Semantic similarity threshold (0.0-1.0)
    semantic_threshold: float = 0.85
    # 2c: Keyword matching weight in the combined score
    keyword_weight: float = 0.6
    # 2d: Context window size
    context_window: int = 3
    # 2e: Multi-intent resolution strategy
    multi_intent_strategy: str = "best_match"  # best_match, combine, sequential
    # Fuzzy matching algorithm
    fuzzy_algorithm: str = "token_sort_ratio"  # ratio, partial_ratio, token_sort_ratio
44
+
45
+
46
@dataclass
class EntityResolutionConfig:
    """Configuration for entity resolution (pipeline stage 3)."""
    # 3a: Entity types to extract
    entity_types: List[str] = field(default_factory=lambda: [
        "function", "class", "module", "variable", "file"
    ])
    # 3b: Name matching threshold (0.0-1.0)
    name_match_threshold: float = 0.9
    # 3c: Context-aware disambiguation between candidate entities
    context_disambiguation: bool = True
    # 3d: Hierarchical resolution (class.method -> method)
    hierarchical_resolution: bool = True
    # 3e: Alias resolution (short names -> qualified names)
    alias_resolution: bool = True
61
+
62
+
63
@dataclass
class MultilingualConfig:
    """Configuration for multilingual query processing."""
    # Supported languages (English and Polish)
    languages: List[str] = field(default_factory=lambda: ["en", "pl"])
    # Default language used when detection is inconclusive
    default_language: str = "en"
    # Language detection confidence threshold (0.0-1.0)
    lang_detect_threshold: float = 0.7
    # Cross-language matching (match queries against identifiers in any
    # supported language)
    cross_language_matching: bool = True
74
+
75
+
76
@dataclass
class NLPConfig:
    """Main NLP pipeline configuration.

    Aggregates the per-stage sub-configurations and the on/off switches
    for each pipeline stage, with YAML load/save helpers.
    """
    # Sub-configurations
    normalization: NormalizationConfig = field(default_factory=NormalizationConfig)
    intent_matching: IntentMatchingConfig = field(default_factory=IntentMatchingConfig)
    entity_resolution: EntityResolutionConfig = field(default_factory=EntityResolutionConfig)
    multilingual: MultilingualConfig = field(default_factory=MultilingualConfig)

    # Pipeline stages
    enable_normalization: bool = True
    enable_intent_matching: bool = True
    enable_entity_resolution: bool = True

    # Logging
    verbose: bool = False

    @classmethod
    def from_yaml(cls, path: str) -> "NLPConfig":
        """Load configuration from YAML file.

        Missing sections fall back to their dataclass defaults; an empty
        YAML document yields an all-default configuration.

        Raises OSError if the file cannot be read, and TypeError if a
        section contains keys the corresponding dataclass does not define.
        """
        with open(path, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty document — normalize to {}
            # so the .get() lookups below don't raise AttributeError.
            data = yaml.safe_load(f) or {}

        return cls(
            normalization=NormalizationConfig(**data.get('normalization', {})),
            intent_matching=IntentMatchingConfig(**data.get('intent_matching', {})),
            entity_resolution=EntityResolutionConfig(**data.get('entity_resolution', {})),
            multilingual=MultilingualConfig(**data.get('multilingual', {})),
            enable_normalization=data.get('enable_normalization', True),
            enable_intent_matching=data.get('enable_intent_matching', True),
            enable_entity_resolution=data.get('enable_entity_resolution', True),
            verbose=data.get('verbose', False),
        )

    def to_yaml(self, path: str) -> None:
        """Save configuration to YAML file, creating parent directories as needed."""
        # __dict__ flattens each sub-config into a plain mapping, the shape
        # from_yaml expects back.
        data = {
            'normalization': self.normalization.__dict__,
            'intent_matching': self.intent_matching.__dict__,
            'entity_resolution': self.entity_resolution.__dict__,
            'multilingual': self.multilingual.__dict__,
            'enable_normalization': self.enable_normalization,
            'enable_intent_matching': self.enable_intent_matching,
            'enable_entity_resolution': self.enable_entity_resolution,
            'verbose': self.verbose,
        }

        Path(path).parent.mkdir(parents=True, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True)
126
+
127
+
128
# Predefined configurations

# Speed-oriented preset: looser fuzzy thresholds, keyword-heavy scoring,
# only function/class entities, and no context disambiguation.
FAST_NLP_CONFIG = NLPConfig(
    normalization=NormalizationConfig(
        lowercase=True,
        remove_punctuation=True,
        normalize_whitespace=True,
        unicode_normalize=True,
        remove_stopwords=False,
    ),
    intent_matching=IntentMatchingConfig(
        fuzzy_threshold=0.7,  # Lower threshold for speed
        semantic_threshold=0.8,
        keyword_weight=0.8,  # Higher weight on keywords
        fuzzy_algorithm="ratio",  # Faster algorithm
    ),
    entity_resolution=EntityResolutionConfig(
        entity_types=["function", "class"],
        name_match_threshold=0.85,
        context_disambiguation=False,  # Skip for speed
    ),
    verbose=False,
)

# Accuracy-oriented preset: strict thresholds, stopword removal, all entity
# types, full disambiguation — with verbose logging enabled.
PRECISE_NLP_CONFIG = NLPConfig(
    normalization=NormalizationConfig(
        lowercase=True,
        remove_punctuation=True,
        normalize_whitespace=True,
        unicode_normalize=True,
        remove_stopwords=True,
    ),
    intent_matching=IntentMatchingConfig(
        fuzzy_threshold=0.9,
        semantic_threshold=0.95,
        keyword_weight=0.4,
        context_window=5,
        fuzzy_algorithm="token_sort_ratio",
    ),
    entity_resolution=EntityResolutionConfig(
        entity_types=["function", "class", "module", "variable", "file"],
        name_match_threshold=0.95,
        context_disambiguation=True,
        hierarchical_resolution=True,
        alias_resolution=True,
    ),
    verbose=True,
)