osscodeiq 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. osscodeiq/__init__.py +0 -0
  2. osscodeiq/analyzer.py +467 -0
  3. osscodeiq/cache/__init__.py +0 -0
  4. osscodeiq/cache/hasher.py +23 -0
  5. osscodeiq/cache/store.py +300 -0
  6. osscodeiq/classifiers/__init__.py +0 -0
  7. osscodeiq/classifiers/layer_classifier.py +69 -0
  8. osscodeiq/cli.py +721 -0
  9. osscodeiq/config.py +113 -0
  10. osscodeiq/detectors/__init__.py +0 -0
  11. osscodeiq/detectors/auth/__init__.py +0 -0
  12. osscodeiq/detectors/auth/certificate_auth.py +139 -0
  13. osscodeiq/detectors/auth/ldap_auth.py +89 -0
  14. osscodeiq/detectors/auth/session_header_auth.py +120 -0
  15. osscodeiq/detectors/base.py +41 -0
  16. osscodeiq/detectors/config/__init__.py +0 -0
  17. osscodeiq/detectors/config/batch_structure.py +128 -0
  18. osscodeiq/detectors/config/cloudformation.py +183 -0
  19. osscodeiq/detectors/config/docker_compose.py +179 -0
  20. osscodeiq/detectors/config/github_actions.py +150 -0
  21. osscodeiq/detectors/config/gitlab_ci.py +216 -0
  22. osscodeiq/detectors/config/helm_chart.py +187 -0
  23. osscodeiq/detectors/config/ini_structure.py +101 -0
  24. osscodeiq/detectors/config/json_structure.py +72 -0
  25. osscodeiq/detectors/config/kubernetes.py +305 -0
  26. osscodeiq/detectors/config/kubernetes_rbac.py +212 -0
  27. osscodeiq/detectors/config/openapi.py +194 -0
  28. osscodeiq/detectors/config/package_json.py +99 -0
  29. osscodeiq/detectors/config/properties_detector.py +108 -0
  30. osscodeiq/detectors/config/pyproject_toml.py +169 -0
  31. osscodeiq/detectors/config/sql_structure.py +155 -0
  32. osscodeiq/detectors/config/toml_structure.py +93 -0
  33. osscodeiq/detectors/config/tsconfig_json.py +105 -0
  34. osscodeiq/detectors/config/yaml_structure.py +82 -0
  35. osscodeiq/detectors/cpp/__init__.py +0 -0
  36. osscodeiq/detectors/cpp/cpp_structures.py +192 -0
  37. osscodeiq/detectors/csharp/__init__.py +0 -0
  38. osscodeiq/detectors/csharp/csharp_efcore.py +184 -0
  39. osscodeiq/detectors/csharp/csharp_minimal_apis.py +156 -0
  40. osscodeiq/detectors/csharp/csharp_structures.py +317 -0
  41. osscodeiq/detectors/docs/__init__.py +0 -0
  42. osscodeiq/detectors/docs/markdown_structure.py +117 -0
  43. osscodeiq/detectors/frontend/__init__.py +0 -0
  44. osscodeiq/detectors/frontend/angular_components.py +177 -0
  45. osscodeiq/detectors/frontend/frontend_routes.py +259 -0
  46. osscodeiq/detectors/frontend/react_components.py +148 -0
  47. osscodeiq/detectors/frontend/svelte_components.py +84 -0
  48. osscodeiq/detectors/frontend/vue_components.py +150 -0
  49. osscodeiq/detectors/generic/__init__.py +1 -0
  50. osscodeiq/detectors/generic/imports_detector.py +413 -0
  51. osscodeiq/detectors/go/__init__.py +0 -0
  52. osscodeiq/detectors/go/go_orm.py +202 -0
  53. osscodeiq/detectors/go/go_structures.py +162 -0
  54. osscodeiq/detectors/go/go_web.py +157 -0
  55. osscodeiq/detectors/iac/__init__.py +0 -0
  56. osscodeiq/detectors/iac/bicep.py +135 -0
  57. osscodeiq/detectors/iac/dockerfile.py +182 -0
  58. osscodeiq/detectors/iac/terraform.py +188 -0
  59. osscodeiq/detectors/java/__init__.py +0 -0
  60. osscodeiq/detectors/java/azure_functions.py +424 -0
  61. osscodeiq/detectors/java/azure_messaging.py +350 -0
  62. osscodeiq/detectors/java/class_hierarchy.py +349 -0
  63. osscodeiq/detectors/java/config_def.py +82 -0
  64. osscodeiq/detectors/java/cosmos_db.py +105 -0
  65. osscodeiq/detectors/java/graphql_resolver.py +188 -0
  66. osscodeiq/detectors/java/grpc_service.py +142 -0
  67. osscodeiq/detectors/java/ibm_mq.py +178 -0
  68. osscodeiq/detectors/java/jaxrs.py +160 -0
  69. osscodeiq/detectors/java/jdbc.py +196 -0
  70. osscodeiq/detectors/java/jms.py +116 -0
  71. osscodeiq/detectors/java/jpa_entity.py +143 -0
  72. osscodeiq/detectors/java/kafka.py +113 -0
  73. osscodeiq/detectors/java/kafka_protocol.py +70 -0
  74. osscodeiq/detectors/java/micronaut.py +248 -0
  75. osscodeiq/detectors/java/module_deps.py +191 -0
  76. osscodeiq/detectors/java/public_api.py +206 -0
  77. osscodeiq/detectors/java/quarkus.py +176 -0
  78. osscodeiq/detectors/java/rabbitmq.py +150 -0
  79. osscodeiq/detectors/java/raw_sql.py +136 -0
  80. osscodeiq/detectors/java/repository.py +131 -0
  81. osscodeiq/detectors/java/rmi.py +129 -0
  82. osscodeiq/detectors/java/spring_events.py +117 -0
  83. osscodeiq/detectors/java/spring_rest.py +168 -0
  84. osscodeiq/detectors/java/spring_security.py +212 -0
  85. osscodeiq/detectors/java/tibco_ems.py +193 -0
  86. osscodeiq/detectors/java/websocket.py +188 -0
  87. osscodeiq/detectors/kotlin/__init__.py +0 -0
  88. osscodeiq/detectors/kotlin/kotlin_structures.py +124 -0
  89. osscodeiq/detectors/kotlin/ktor_routes.py +163 -0
  90. osscodeiq/detectors/proto/__init__.py +0 -0
  91. osscodeiq/detectors/proto/proto_structure.py +153 -0
  92. osscodeiq/detectors/python/__init__.py +0 -0
  93. osscodeiq/detectors/python/celery_tasks.py +88 -0
  94. osscodeiq/detectors/python/django_auth.py +132 -0
  95. osscodeiq/detectors/python/django_models.py +157 -0
  96. osscodeiq/detectors/python/django_views.py +74 -0
  97. osscodeiq/detectors/python/fastapi_auth.py +143 -0
  98. osscodeiq/detectors/python/fastapi_routes.py +68 -0
  99. osscodeiq/detectors/python/flask_routes.py +67 -0
  100. osscodeiq/detectors/python/kafka_python.py +175 -0
  101. osscodeiq/detectors/python/pydantic_models.py +115 -0
  102. osscodeiq/detectors/python/python_structures.py +234 -0
  103. osscodeiq/detectors/python/sqlalchemy_models.py +82 -0
  104. osscodeiq/detectors/registry.py +100 -0
  105. osscodeiq/detectors/rust/__init__.py +0 -0
  106. osscodeiq/detectors/rust/actix_web.py +234 -0
  107. osscodeiq/detectors/rust/rust_structures.py +174 -0
  108. osscodeiq/detectors/scala/__init__.py +0 -0
  109. osscodeiq/detectors/scala/scala_structures.py +128 -0
  110. osscodeiq/detectors/shell/__init__.py +0 -0
  111. osscodeiq/detectors/shell/bash_detector.py +127 -0
  112. osscodeiq/detectors/shell/powershell_detector.py +118 -0
  113. osscodeiq/detectors/typescript/__init__.py +0 -0
  114. osscodeiq/detectors/typescript/express_routes.py +55 -0
  115. osscodeiq/detectors/typescript/fastify_routes.py +156 -0
  116. osscodeiq/detectors/typescript/graphql_resolvers.py +100 -0
  117. osscodeiq/detectors/typescript/kafka_js.py +164 -0
  118. osscodeiq/detectors/typescript/mongoose_orm.py +151 -0
  119. osscodeiq/detectors/typescript/nestjs_controllers.py +99 -0
  120. osscodeiq/detectors/typescript/nestjs_guards.py +138 -0
  121. osscodeiq/detectors/typescript/passport_jwt.py +133 -0
  122. osscodeiq/detectors/typescript/prisma_orm.py +96 -0
  123. osscodeiq/detectors/typescript/remix_routes.py +160 -0
  124. osscodeiq/detectors/typescript/sequelize_orm.py +136 -0
  125. osscodeiq/detectors/typescript/typeorm_entities.py +86 -0
  126. osscodeiq/detectors/typescript/typescript_structures.py +185 -0
  127. osscodeiq/detectors/utils.py +49 -0
  128. osscodeiq/discovery/__init__.py +11 -0
  129. osscodeiq/discovery/change_detector.py +97 -0
  130. osscodeiq/discovery/file_discovery.py +342 -0
  131. osscodeiq/flow/__init__.py +0 -0
  132. osscodeiq/flow/engine.py +78 -0
  133. osscodeiq/flow/models.py +72 -0
  134. osscodeiq/flow/renderer.py +127 -0
  135. osscodeiq/flow/templates/interactive.html +252 -0
  136. osscodeiq/flow/vendor/cytoscape-dagre.min.js +8 -0
  137. osscodeiq/flow/vendor/cytoscape.min.js +32 -0
  138. osscodeiq/flow/vendor/dagre.min.js +3809 -0
  139. osscodeiq/flow/views.py +357 -0
  140. osscodeiq/graph/__init__.py +0 -0
  141. osscodeiq/graph/backend.py +52 -0
  142. osscodeiq/graph/backends/__init__.py +23 -0
  143. osscodeiq/graph/backends/kuzu.py +576 -0
  144. osscodeiq/graph/backends/networkx.py +135 -0
  145. osscodeiq/graph/backends/sqlite_backend.py +406 -0
  146. osscodeiq/graph/builder.py +297 -0
  147. osscodeiq/graph/query.py +228 -0
  148. osscodeiq/graph/store.py +183 -0
  149. osscodeiq/graph/views.py +231 -0
  150. osscodeiq/models/__init__.py +17 -0
  151. osscodeiq/models/graph.py +116 -0
  152. osscodeiq/output/__init__.py +0 -0
  153. osscodeiq/output/dot.py +171 -0
  154. osscodeiq/output/mermaid.py +160 -0
  155. osscodeiq/output/safety.py +58 -0
  156. osscodeiq/output/serializers.py +42 -0
  157. osscodeiq/parsing/__init__.py +5 -0
  158. osscodeiq/parsing/languages/__init__.py +0 -0
  159. osscodeiq/parsing/languages/base.py +23 -0
  160. osscodeiq/parsing/languages/java.py +68 -0
  161. osscodeiq/parsing/languages/python.py +57 -0
  162. osscodeiq/parsing/languages/typescript.py +95 -0
  163. osscodeiq/parsing/parser_manager.py +125 -0
  164. osscodeiq/parsing/structured/__init__.py +0 -0
  165. osscodeiq/parsing/structured/gradle_parser.py +78 -0
  166. osscodeiq/parsing/structured/json_parser.py +24 -0
  167. osscodeiq/parsing/structured/properties_parser.py +56 -0
  168. osscodeiq/parsing/structured/sql_parser.py +54 -0
  169. osscodeiq/parsing/structured/xml_parser.py +148 -0
  170. osscodeiq/parsing/structured/yaml_parser.py +38 -0
  171. osscodeiq/server/__init__.py +7 -0
  172. osscodeiq/server/app.py +53 -0
  173. osscodeiq/server/mcp_server.py +174 -0
  174. osscodeiq/server/middleware.py +16 -0
  175. osscodeiq/server/routes.py +184 -0
  176. osscodeiq/server/service.py +445 -0
  177. osscodeiq/server/templates/welcome.html +56 -0
  178. osscodeiq-0.0.0.dist-info/METADATA +30 -0
  179. osscodeiq-0.0.0.dist-info/RECORD +183 -0
  180. osscodeiq-0.0.0.dist-info/WHEEL +5 -0
  181. osscodeiq-0.0.0.dist-info/entry_points.txt +2 -0
  182. osscodeiq-0.0.0.dist-info/licenses/LICENSE +21 -0
  183. osscodeiq-0.0.0.dist-info/top_level.txt +1 -0
osscodeiq/__init__.py ADDED
File without changes
osscodeiq/analyzer.py ADDED
@@ -0,0 +1,467 @@
1
+ """Pipeline orchestrator for OSSCodeIQ analysis."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ from osscodeiq.cache.store import CacheStore
12
+ from osscodeiq.config import Config
13
+ from osscodeiq.detectors.base import DetectorContext, DetectorResult
14
+ from osscodeiq.detectors.registry import DetectorRegistry
15
+ from osscodeiq.discovery.change_detector import ChangeDetector
16
+ from osscodeiq.discovery.file_discovery import (
17
+ ChangeType,
18
+ DiscoveredFile,
19
+ FileDiscovery,
20
+ )
21
+ from osscodeiq.graph.builder import GraphBuilder
22
+ from osscodeiq.graph.store import GraphStore
23
+ from osscodeiq.models.graph import GraphEdge, GraphNode
24
+
25
# Module-level logger for the analyzer pipeline.
logger = logging.getLogger(__name__)

# Languages parsed with tree-sitter (used by _analyze_file to decide whether
# to build a syntax tree).
_TREESITTER_LANGUAGES = {"java", "python", "typescript", "javascript"}

# Languages routed through the structured-parser dispatch table.
# Fix: "markdown" and "proto" have passthrough parsers registered in
# _STRUCTURED_PARSERS but were missing from this set, so their files were
# never dispatched to those parsers; they are now included.
_STRUCTURED_LANGUAGES = {
    "xml", "yaml", "json", "properties", "gradle", "sql",
    "bicep", "terraform", "csharp", "go", "cpp", "c",
    "bash", "powershell", "batch", "ruby", "rust", "kotlin",
    "scala", "swift", "r", "perl", "lua", "dart",
    "dockerfile", "toml", "ini", "dotenv", "csv",
    "vue", "svelte", "markdown", "proto",
    "html", "css", "scss", "less", "razor", "cshtml", "asciidoc",
    "makefile", "gomod", "gosum", "groovy",
}
41
+
42
+
43
@dataclass
class AnalysisResult:
    """Result of running the full analysis pipeline."""

    # Populated graph store returned by GraphBuilder.build().
    graph: GraphStore
    # Number of files actually parsed and run through detectors this run.
    files_analyzed: int
    # Number of files whose results were loaded from the cache instead.
    files_cached: int
    # Total number of files discovered in the repository.
    total_files: int
    # Language identifier -> count of discovered files in that language.
    language_breakdown: dict[str, int]
    # Node-kind value -> count of graph nodes of that kind.
    node_breakdown: dict[str, int]
    # Count of discovered files with at least one registered detector.
    files_with_detectors: int
    # Count of discovered files with no registered detector.
    files_without_detectors: int
55
+
56
+
57
def _parse_toml(content: bytes, file_path: str) -> dict:
    """Decode and parse TOML *content*.

    Returns ``{"type": "toml", "file": ..., "data": ...}`` on success, or an
    ``{"error": "invalid_toml", ...}`` dict describing the failure.
    """
    try:
        import tomllib
    except ModuleNotFoundError:
        # Python < 3.11: fall back to the third-party backport.
        import tomli as tomllib  # type: ignore[no-redef]

    try:
        decoded = content.decode("utf-8", errors="replace")
        parsed = tomllib.loads(decoded)
    except Exception as exc:
        return {"error": "invalid_toml", "file": file_path, "detail": str(exc)}
    return {"type": "toml", "file": file_path, "data": parsed}
69
+
70
+
71
def _parse_ini(content: bytes, file_path: str) -> dict:
    """Decode and parse INI *content* into ``{section: {key: value}}``.

    Returns an ``{"error": "invalid_ini", ...}`` dict when parsing fails.
    """
    import configparser

    try:
        reader = configparser.ConfigParser()
        reader.read_string(content.decode("utf-8", errors="replace"))
        sections = {name: dict(reader[name]) for name in reader.sections()}
    except Exception as exc:
        return {"error": "invalid_ini", "file": file_path, "detail": str(exc)}
    return {"type": "ini", "file": file_path, "data": sections}
82
+
83
+
84
def _text_passthrough(lang: str):
    """Build a parser that returns the raw decoded text for regex-based detectors."""
    def _passthrough(content: bytes, file_path: str) -> dict:
        text = content.decode("utf-8", errors="replace")
        return {"type": lang, "file": file_path, "data": text}
    return _passthrough
89
+
90
+
91
def _class_parser(module_path: str, class_name: str):
    """Build a parser that lazily imports *class_name* from *module_path*.

    The returned callable instantiates the class and delegates to its
    ``parse(content, file_path)`` method; the import happens on first call.
    """
    def _delegate(content: bytes, file_path: str):
        module = __import__(module_path, fromlist=[class_name])
        parser_cls = getattr(module, class_name)
        return parser_cls().parse(content, file_path)
    return _delegate
98
+
99
+
100
# Dispatch table for structured parsers. Keyed by language identifier.
# Each value is a callable taking (content: bytes, file_path: str):
#   - dedicated parser classes are loaded lazily via _class_parser, so their
#     modules are only imported the first time that language is parsed;
#   - "toml"/"ini" use the stdlib-based helpers defined above;
#   - the remaining entries pass the raw decoded text through unchanged for
#     regex-based detectors.
# Dispatch only happens for languages the caller routes here (see
# _parse_structured and its call site in _analyze_file).
_STRUCTURED_PARSERS: dict[str, Any] = {
    "xml": _class_parser("osscodeiq.parsing.structured.xml_parser", "XmlParser"),
    "yaml": _class_parser("osscodeiq.parsing.structured.yaml_parser", "YamlParser"),
    "json": _class_parser("osscodeiq.parsing.structured.json_parser", "JsonParser"),
    "properties": _class_parser("osscodeiq.parsing.structured.properties_parser", "PropertiesParser"),
    "gradle": _class_parser("osscodeiq.parsing.structured.gradle_parser", "GradleParser"),
    "sql": _class_parser("osscodeiq.parsing.structured.sql_parser", "SqlParser"),
    "toml": _parse_toml,
    "ini": _parse_ini,
    "markdown": _text_passthrough("markdown"),
    "proto": _text_passthrough("proto"),
    "vue": _text_passthrough("vue"),
    "svelte": _text_passthrough("svelte"),
    "html": _text_passthrough("html"),
    "css": _text_passthrough("css"),
    "scss": _text_passthrough("scss"),
    "less": _text_passthrough("less"),
    "razor": _text_passthrough("razor"),
    "cshtml": _text_passthrough("cshtml"),
    "asciidoc": _text_passthrough("asciidoc"),
    "makefile": _text_passthrough("makefile"),
    "gomod": _text_passthrough("gomod"),
    "gosum": _text_passthrough("gosum"),
    "groovy": _text_passthrough("groovy"),
}
126
+
127
+
128
def _parse_structured(language: str, content: bytes, file_path: str) -> Any:
    """Run the structured parser registered for *language*.

    Returns the parser's result, or ``None`` when no parser is registered for
    the language or the parser raised (logged at debug level).
    """
    parser = _STRUCTURED_PARSERS.get(language)
    if parser is None:
        return None
    try:
        return parser(content, file_path)
    except Exception:
        logger.debug("Structured parse failed for %s", file_path, exc_info=True)
    return None
137
+
138
+
139
def _analyze_file(
    file: DiscoveredFile,
    repo_path: Path,
    registry: DetectorRegistry,
    parser_manager: Any | None = None,
) -> tuple[DiscoveredFile, DetectorResult]:
    """Analyze a single file: read, parse, run detectors.

    This function is designed to be called from worker threads.
    Tree-sitter releases the GIL during parsing, so ThreadPoolExecutor
    gives real parallelism for the parse step.

    Args:
        file: Discovered file (repository-relative ``path`` and ``language``).
        repo_path: Absolute repository root, joined with ``file.path`` to read.
        registry: Registry used to look up detectors by language.
        parser_manager: Optional tree-sitter parser manager; when ``None``,
            tree-sitter parsing is skipped entirely.

    Returns:
        The input ``file`` paired with the merged result of every detector
        that ran (an empty result if the file could not be read).
    """
    abs_path = repo_path / file.path
    try:
        content = abs_path.read_bytes()
    except OSError:
        # Unreadable files (permissions, deleted mid-run) are skipped rather
        # than failing the whole analysis.
        logger.warning("Could not read file %s", abs_path)
        return file, DetectorResult()

    tree = None
    parsed_data = None

    # Tree-sitter parse for supported languages; a parse failure degrades to
    # detectors running without a syntax tree.
    if parser_manager is not None and file.language in _TREESITTER_LANGUAGES:
        try:
            tree = parser_manager.parse_file(file, content)
        except Exception:
            logger.debug("Tree-sitter parse failed for %s", file.path, exc_info=True)

    # Structured file parsing (XML/YAML/TOML/… via the dispatch table);
    # parsed_data stays None on failure.
    if file.language in _STRUCTURED_LANGUAGES:
        try:
            parsed_data = _parse_structured(file.language, content, str(file.path))
        except Exception:
            logger.debug("Structured parse failed for %s", file.path, exc_info=True)

    module_name = _derive_module_name(file.path, file.language)

    # Context handed to every detector for this file.
    ctx = DetectorContext(
        file_path=str(file.path),
        language=file.language,
        content=content,
        tree=tree,
        parsed_data=parsed_data,
        module_name=module_name,
    )

    # Run every detector registered for this language, merging their nodes
    # and edges. One failing detector is logged and skipped so the rest run.
    merged = DetectorResult()
    for detector in registry.detectors_for_language(file.language):
        try:
            result = detector.detect(ctx)
            merged.nodes.extend(result.nodes)
            merged.edges.extend(result.edges)
        except Exception:
            logger.warning(
                "Detector %s failed on %s",
                detector.name,
                file.path,
                exc_info=True,
            )

    return file, merged
201
+
202
+
203
def _derive_module_name(path: Path, language: str) -> str | None:
    """Best-effort module name from a file path.

    Args:
        path: Repository-relative path of the file.
        language: Detected language identifier for the file.

    Returns:
        A dotted module/package name, or ``None`` when no sensible module
        name can be derived (top-level files, unknown layouts).
    """
    joined = "/".join(path.parts)

    if language == "java":
        # Java: the package is the directory path below the conventional
        # Maven/Gradle source roots; files directly under the root (or not
        # under either root) have no package.
        for marker in ("src/main/java/", "src/test/java/"):
            if marker in joined:
                idx = joined.index(marker) + len(marker)
                remainder = joined[idx:]
                pkg = remainder.rsplit("/", 1)[0] if "/" in remainder else ""
                return pkg.replace("/", ".") if pkg else None
        return None

    # Python sources and structured files (XML/YAML/…) share the same rule:
    # the parent directory becomes a dotted module name, and top-level files
    # have none.  (Previously duplicated as two identical branches.)
    if language == "python" or language in _STRUCTURED_LANGUAGES:
        parent = path.parent
        if str(parent) == ".":
            return None
        return str(parent).replace("/", ".").replace("\\", ".")

    return None
231
+
232
+
233
class Analyzer:
    """Orchestrates the full OSSCodeIQ analysis pipeline.

    Steps:
        1. Discover files (FileDiscovery)
        2. If incremental, detect changed files and load cached results for unchanged
        3. Parse and run detectors on changed/new files
        4. Aggregate results in GraphBuilder
        5. Run cross-file linkers
        6. Cache new results
        7. Return AnalysisResult
    """

    def __init__(self, config: Config | None = None) -> None:
        """Build the analyzer and load built-in plus plugin detectors.

        Args:
            config: Optional configuration; a default ``Config()`` is used
                when omitted.
        """
        self._config = config or Config()
        self._registry = DetectorRegistry()
        self._registry.load_builtin_detectors()
        self._registry.load_plugin_detectors()

        # Create ParserManager once (thread-safe via internal pool).
        # When construction fails, analysis still runs — just without
        # tree-sitter syntax trees.
        self._parser_manager = None
        try:
            from osscodeiq.parsing.parser_manager import ParserManager
            self._parser_manager = ParserManager()
        except Exception:
            logger.warning("ParserManager unavailable, tree-sitter parsing disabled", exc_info=True)

    def run(
        self,
        repo_path: Path,
        incremental: bool = True,
        on_progress: Any | None = None,
    ) -> AnalysisResult:
        """Execute the analysis pipeline on *repo_path*.

        *on_progress*, when provided, is called with a status string at
        each major pipeline milestone.

        Args:
            repo_path: Repository root to analyze (resolved to absolute).
            incremental: When True and caching is enabled, reuse cached
                per-file results and only re-analyze changed files.
            on_progress: Optional callable taking a status message string.

        Returns:
            An :class:`AnalysisResult` with the populated graph and run stats.
        """
        def _report(msg: str) -> None:
            # Progress reporting is a no-op when no callback was supplied.
            if on_progress is not None:
                on_progress(msg)

        repo_path = repo_path.resolve()

        # ----------------------------------------------------------
        # 1. Discover files
        # ----------------------------------------------------------
        _report("🔍 Discovering files…")
        discovery = FileDiscovery(self._config)
        all_files = discovery.discover(repo_path)
        current_commit = discovery.current_commit
        total_files = len(all_files)

        # Compute language breakdown and detector coverage
        language_breakdown: dict[str, int] = {}
        files_with_detectors = 0
        files_without_detectors = 0
        for f in all_files:
            language_breakdown[f.language] = language_breakdown.get(f.language, 0) + 1
            if self._registry.detectors_for_language(f.language):
                files_with_detectors += 1
            else:
                files_without_detectors += 1

        _report(f"📁 Found {total_files} files")
        logger.info("Discovered %d files in %s", total_files, repo_path)

        # ----------------------------------------------------------
        # 2. Determine which files need (re-)analysis
        # ----------------------------------------------------------
        cache_cfg = self._config.cache
        cache: CacheStore | None = None
        files_to_analyze: list[DiscoveredFile] = all_files
        files_cached = 0

        # Imported here so backend modules load only when a run happens.
        from osscodeiq.graph.backends import create_backend
        # Ensure parent directory exists for file-based backends
        graph_path = self._config.graph.path
        if graph_path:
            Path(graph_path).parent.mkdir(parents=True, exist_ok=True)
        backend = create_backend(self._config.graph.backend, path=graph_path)
        builder = GraphBuilder(backend=backend)

        if cache_cfg.enabled:
            cache_path = repo_path / cache_cfg.directory / cache_cfg.db_name
            cache = CacheStore(cache_path)

        if incremental and cache is not None:
            last_commit = cache.get_last_commit()

            # Use ChangeDetector to find deleted files and purge stale cache
            if last_commit and current_commit and last_commit != current_commit:
                try:
                    change_detector = ChangeDetector()
                    changes = change_detector.detect_changes(repo_path, last_commit)
                    for changed in changes:
                        if changed.change_type == ChangeType.DELETED:
                            cache.remove_by_path(str(changed.path))
                        elif changed.change_type == ChangeType.MODIFIED:
                            cache.remove_by_path(str(changed.path))
                except Exception:
                    # Git-based change detection is best-effort; the
                    # content-hash partition below still catches changes.
                    logger.debug("ChangeDetector failed, falling back to hash-based", exc_info=True)

            # Partition files into cached vs needs-analysis
            files_to_analyze = []
            for f in all_files:
                if cache.is_cached(f.content_hash):
                    nodes, edges = cache.load_cached_results(f.content_hash)
                    builder.add_nodes(nodes)
                    builder.add_edges(edges)
                    files_cached += 1
                else:
                    files_to_analyze.append(f)

            _report(f"💾 {files_cached} cached, {len(files_to_analyze)} to analyze")
            logger.info(
                "Incremental: %d cached, %d to analyze",
                files_cached,
                len(files_to_analyze),
            )

        files_analyzed = len(files_to_analyze)

        # ----------------------------------------------------------
        # 3 & 4. Parse and run detectors
        # ----------------------------------------------------------
        if files_to_analyze:
            _report(f"⚙️ Analyzing {files_analyzed} files…")
            parallelism = self._config.analysis.parallelism

            pm = self._parser_manager

            if parallelism <= 1 or len(files_to_analyze) <= 1:
                # Sequential path: no thread overhead for tiny workloads.
                results = [
                    _analyze_file(f, repo_path, self._registry, pm)
                    for f in files_to_analyze
                ]
            else:
                max_workers = min(parallelism, len(files_to_analyze))
                # Use a list aligned with files_to_analyze to preserve
                # deterministic ordering regardless of thread completion order.
                result_slots: list[tuple[DiscoveredFile, DetectorResult] | None] = [None] * len(files_to_analyze)
                with ThreadPoolExecutor(max_workers=max_workers) as executor:
                    futures = {
                        executor.submit(
                            _analyze_file, f, repo_path, self._registry, pm
                        ): idx
                        for idx, f in enumerate(files_to_analyze)
                    }
                    for future in as_completed(futures):
                        idx = futures[future]
                        try:
                            result_slots[idx] = future.result()
                        except Exception:
                            # A failed file leaves its slot None and is
                            # filtered out below; the run continues.
                            logger.warning(
                                "Analysis failed for %s",
                                files_to_analyze[idx].path,
                                exc_info=True,
                            )
                results = [r for r in result_slots if r is not None]

            # ----------------------------------------------------------
            # 5. Aggregate results into graph builder
            # ----------------------------------------------------------
            for file, detector_result in results:
                builder.merge_detector_result(detector_result)

                # Cache new results
                if cache is not None:
                    try:
                        cache.store_results(
                            content_hash=file.content_hash,
                            file_path=str(file.path),
                            language=file.language,
                            nodes=detector_result.nodes,
                            edges=detector_result.edges,
                        )
                    except Exception:
                        # Caching is best-effort; an unwritable cache must
                        # not abort the analysis.
                        logger.warning(
                            "Failed to cache results for %s",
                            file.path,
                            exc_info=True,
                        )

        # ----------------------------------------------------------
        # 6. Run cross-file linkers
        # ----------------------------------------------------------
        _report("🔗 Linking cross-file relationships…")
        builder.run_linkers()

        # ----------------------------------------------------------
        # 6b. Classify layers (after linkers so all nodes are covered)
        # ----------------------------------------------------------
        from osscodeiq.classifiers.layer_classifier import LayerClassifier
        # NOTE(review): reaches into GraphBuilder's private ``_store``
        # attribute — consider exposing a public accessor on GraphBuilder.
        LayerClassifier().classify_store(builder._store)

        # ----------------------------------------------------------
        # 7. Record run and return result
        # ----------------------------------------------------------
        if cache is not None:
            try:
                cache.record_run(
                    commit_sha=current_commit or "",
                    file_count=total_files,
                )
            except Exception:
                logger.warning("Failed to record analysis run", exc_info=True)
            finally:
                # Always release the cache's underlying resources.
                cache.close()

        graph = builder.build()

        # Compute node breakdown
        node_breakdown: dict[str, int] = {}
        for node in graph.all_nodes():
            kind = node.kind.value
            node_breakdown[kind] = node_breakdown.get(kind, 0) + 1

        _report(f"✅ Analysis complete — {graph.node_count} nodes, {graph.edge_count} edges")
        logger.info(
            "Analysis complete: %d nodes, %d edges",
            graph.node_count,
            graph.edge_count,
        )

        return AnalysisResult(
            graph=graph,
            files_analyzed=files_analyzed,
            files_cached=files_cached,
            total_files=total_files,
            language_breakdown=language_breakdown,
            node_breakdown=node_breakdown,
            files_with_detectors=files_with_detectors,
            files_without_detectors=files_without_detectors,
        )
File without changes
@@ -0,0 +1,23 @@
1
+ """Content hashing utilities for cache invalidation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ from pathlib import Path
7
+
8
+
9
def hash_file_content(content: bytes) -> str:
    """Return the SHA-256 hex digest of *content*."""
    digest = hashlib.sha256()
    digest.update(content)
    return digest.hexdigest()
12
+
13
+
14
def hash_file(path: Path) -> str:
    """Read *path* and return its SHA-256 hex digest.

    Streams the file in 8 KiB chunks so large files are never loaded into
    memory all at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as handle:
        while chunk := handle.read(8192):
            digest.update(chunk)
    return digest.hexdigest()