osscodeiq 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- osscodeiq/__init__.py +0 -0
- osscodeiq/analyzer.py +467 -0
- osscodeiq/cache/__init__.py +0 -0
- osscodeiq/cache/hasher.py +23 -0
- osscodeiq/cache/store.py +300 -0
- osscodeiq/classifiers/__init__.py +0 -0
- osscodeiq/classifiers/layer_classifier.py +69 -0
- osscodeiq/cli.py +721 -0
- osscodeiq/config.py +113 -0
- osscodeiq/detectors/__init__.py +0 -0
- osscodeiq/detectors/auth/__init__.py +0 -0
- osscodeiq/detectors/auth/certificate_auth.py +139 -0
- osscodeiq/detectors/auth/ldap_auth.py +89 -0
- osscodeiq/detectors/auth/session_header_auth.py +120 -0
- osscodeiq/detectors/base.py +41 -0
- osscodeiq/detectors/config/__init__.py +0 -0
- osscodeiq/detectors/config/batch_structure.py +128 -0
- osscodeiq/detectors/config/cloudformation.py +183 -0
- osscodeiq/detectors/config/docker_compose.py +179 -0
- osscodeiq/detectors/config/github_actions.py +150 -0
- osscodeiq/detectors/config/gitlab_ci.py +216 -0
- osscodeiq/detectors/config/helm_chart.py +187 -0
- osscodeiq/detectors/config/ini_structure.py +101 -0
- osscodeiq/detectors/config/json_structure.py +72 -0
- osscodeiq/detectors/config/kubernetes.py +305 -0
- osscodeiq/detectors/config/kubernetes_rbac.py +212 -0
- osscodeiq/detectors/config/openapi.py +194 -0
- osscodeiq/detectors/config/package_json.py +99 -0
- osscodeiq/detectors/config/properties_detector.py +108 -0
- osscodeiq/detectors/config/pyproject_toml.py +169 -0
- osscodeiq/detectors/config/sql_structure.py +155 -0
- osscodeiq/detectors/config/toml_structure.py +93 -0
- osscodeiq/detectors/config/tsconfig_json.py +105 -0
- osscodeiq/detectors/config/yaml_structure.py +82 -0
- osscodeiq/detectors/cpp/__init__.py +0 -0
- osscodeiq/detectors/cpp/cpp_structures.py +192 -0
- osscodeiq/detectors/csharp/__init__.py +0 -0
- osscodeiq/detectors/csharp/csharp_efcore.py +184 -0
- osscodeiq/detectors/csharp/csharp_minimal_apis.py +156 -0
- osscodeiq/detectors/csharp/csharp_structures.py +317 -0
- osscodeiq/detectors/docs/__init__.py +0 -0
- osscodeiq/detectors/docs/markdown_structure.py +117 -0
- osscodeiq/detectors/frontend/__init__.py +0 -0
- osscodeiq/detectors/frontend/angular_components.py +177 -0
- osscodeiq/detectors/frontend/frontend_routes.py +259 -0
- osscodeiq/detectors/frontend/react_components.py +148 -0
- osscodeiq/detectors/frontend/svelte_components.py +84 -0
- osscodeiq/detectors/frontend/vue_components.py +150 -0
- osscodeiq/detectors/generic/__init__.py +1 -0
- osscodeiq/detectors/generic/imports_detector.py +413 -0
- osscodeiq/detectors/go/__init__.py +0 -0
- osscodeiq/detectors/go/go_orm.py +202 -0
- osscodeiq/detectors/go/go_structures.py +162 -0
- osscodeiq/detectors/go/go_web.py +157 -0
- osscodeiq/detectors/iac/__init__.py +0 -0
- osscodeiq/detectors/iac/bicep.py +135 -0
- osscodeiq/detectors/iac/dockerfile.py +182 -0
- osscodeiq/detectors/iac/terraform.py +188 -0
- osscodeiq/detectors/java/__init__.py +0 -0
- osscodeiq/detectors/java/azure_functions.py +424 -0
- osscodeiq/detectors/java/azure_messaging.py +350 -0
- osscodeiq/detectors/java/class_hierarchy.py +349 -0
- osscodeiq/detectors/java/config_def.py +82 -0
- osscodeiq/detectors/java/cosmos_db.py +105 -0
- osscodeiq/detectors/java/graphql_resolver.py +188 -0
- osscodeiq/detectors/java/grpc_service.py +142 -0
- osscodeiq/detectors/java/ibm_mq.py +178 -0
- osscodeiq/detectors/java/jaxrs.py +160 -0
- osscodeiq/detectors/java/jdbc.py +196 -0
- osscodeiq/detectors/java/jms.py +116 -0
- osscodeiq/detectors/java/jpa_entity.py +143 -0
- osscodeiq/detectors/java/kafka.py +113 -0
- osscodeiq/detectors/java/kafka_protocol.py +70 -0
- osscodeiq/detectors/java/micronaut.py +248 -0
- osscodeiq/detectors/java/module_deps.py +191 -0
- osscodeiq/detectors/java/public_api.py +206 -0
- osscodeiq/detectors/java/quarkus.py +176 -0
- osscodeiq/detectors/java/rabbitmq.py +150 -0
- osscodeiq/detectors/java/raw_sql.py +136 -0
- osscodeiq/detectors/java/repository.py +131 -0
- osscodeiq/detectors/java/rmi.py +129 -0
- osscodeiq/detectors/java/spring_events.py +117 -0
- osscodeiq/detectors/java/spring_rest.py +168 -0
- osscodeiq/detectors/java/spring_security.py +212 -0
- osscodeiq/detectors/java/tibco_ems.py +193 -0
- osscodeiq/detectors/java/websocket.py +188 -0
- osscodeiq/detectors/kotlin/__init__.py +0 -0
- osscodeiq/detectors/kotlin/kotlin_structures.py +124 -0
- osscodeiq/detectors/kotlin/ktor_routes.py +163 -0
- osscodeiq/detectors/proto/__init__.py +0 -0
- osscodeiq/detectors/proto/proto_structure.py +153 -0
- osscodeiq/detectors/python/__init__.py +0 -0
- osscodeiq/detectors/python/celery_tasks.py +88 -0
- osscodeiq/detectors/python/django_auth.py +132 -0
- osscodeiq/detectors/python/django_models.py +157 -0
- osscodeiq/detectors/python/django_views.py +74 -0
- osscodeiq/detectors/python/fastapi_auth.py +143 -0
- osscodeiq/detectors/python/fastapi_routes.py +68 -0
- osscodeiq/detectors/python/flask_routes.py +67 -0
- osscodeiq/detectors/python/kafka_python.py +175 -0
- osscodeiq/detectors/python/pydantic_models.py +115 -0
- osscodeiq/detectors/python/python_structures.py +234 -0
- osscodeiq/detectors/python/sqlalchemy_models.py +82 -0
- osscodeiq/detectors/registry.py +100 -0
- osscodeiq/detectors/rust/__init__.py +0 -0
- osscodeiq/detectors/rust/actix_web.py +234 -0
- osscodeiq/detectors/rust/rust_structures.py +174 -0
- osscodeiq/detectors/scala/__init__.py +0 -0
- osscodeiq/detectors/scala/scala_structures.py +128 -0
- osscodeiq/detectors/shell/__init__.py +0 -0
- osscodeiq/detectors/shell/bash_detector.py +127 -0
- osscodeiq/detectors/shell/powershell_detector.py +118 -0
- osscodeiq/detectors/typescript/__init__.py +0 -0
- osscodeiq/detectors/typescript/express_routes.py +55 -0
- osscodeiq/detectors/typescript/fastify_routes.py +156 -0
- osscodeiq/detectors/typescript/graphql_resolvers.py +100 -0
- osscodeiq/detectors/typescript/kafka_js.py +164 -0
- osscodeiq/detectors/typescript/mongoose_orm.py +151 -0
- osscodeiq/detectors/typescript/nestjs_controllers.py +99 -0
- osscodeiq/detectors/typescript/nestjs_guards.py +138 -0
- osscodeiq/detectors/typescript/passport_jwt.py +133 -0
- osscodeiq/detectors/typescript/prisma_orm.py +96 -0
- osscodeiq/detectors/typescript/remix_routes.py +160 -0
- osscodeiq/detectors/typescript/sequelize_orm.py +136 -0
- osscodeiq/detectors/typescript/typeorm_entities.py +86 -0
- osscodeiq/detectors/typescript/typescript_structures.py +185 -0
- osscodeiq/detectors/utils.py +49 -0
- osscodeiq/discovery/__init__.py +11 -0
- osscodeiq/discovery/change_detector.py +97 -0
- osscodeiq/discovery/file_discovery.py +342 -0
- osscodeiq/flow/__init__.py +0 -0
- osscodeiq/flow/engine.py +78 -0
- osscodeiq/flow/models.py +72 -0
- osscodeiq/flow/renderer.py +127 -0
- osscodeiq/flow/templates/interactive.html +252 -0
- osscodeiq/flow/vendor/cytoscape-dagre.min.js +8 -0
- osscodeiq/flow/vendor/cytoscape.min.js +32 -0
- osscodeiq/flow/vendor/dagre.min.js +3809 -0
- osscodeiq/flow/views.py +357 -0
- osscodeiq/graph/__init__.py +0 -0
- osscodeiq/graph/backend.py +52 -0
- osscodeiq/graph/backends/__init__.py +23 -0
- osscodeiq/graph/backends/kuzu.py +576 -0
- osscodeiq/graph/backends/networkx.py +135 -0
- osscodeiq/graph/backends/sqlite_backend.py +406 -0
- osscodeiq/graph/builder.py +297 -0
- osscodeiq/graph/query.py +228 -0
- osscodeiq/graph/store.py +183 -0
- osscodeiq/graph/views.py +231 -0
- osscodeiq/models/__init__.py +17 -0
- osscodeiq/models/graph.py +116 -0
- osscodeiq/output/__init__.py +0 -0
- osscodeiq/output/dot.py +171 -0
- osscodeiq/output/mermaid.py +160 -0
- osscodeiq/output/safety.py +58 -0
- osscodeiq/output/serializers.py +42 -0
- osscodeiq/parsing/__init__.py +5 -0
- osscodeiq/parsing/languages/__init__.py +0 -0
- osscodeiq/parsing/languages/base.py +23 -0
- osscodeiq/parsing/languages/java.py +68 -0
- osscodeiq/parsing/languages/python.py +57 -0
- osscodeiq/parsing/languages/typescript.py +95 -0
- osscodeiq/parsing/parser_manager.py +125 -0
- osscodeiq/parsing/structured/__init__.py +0 -0
- osscodeiq/parsing/structured/gradle_parser.py +78 -0
- osscodeiq/parsing/structured/json_parser.py +24 -0
- osscodeiq/parsing/structured/properties_parser.py +56 -0
- osscodeiq/parsing/structured/sql_parser.py +54 -0
- osscodeiq/parsing/structured/xml_parser.py +148 -0
- osscodeiq/parsing/structured/yaml_parser.py +38 -0
- osscodeiq/server/__init__.py +7 -0
- osscodeiq/server/app.py +53 -0
- osscodeiq/server/mcp_server.py +174 -0
- osscodeiq/server/middleware.py +16 -0
- osscodeiq/server/routes.py +184 -0
- osscodeiq/server/service.py +445 -0
- osscodeiq/server/templates/welcome.html +56 -0
- osscodeiq-0.0.0.dist-info/METADATA +30 -0
- osscodeiq-0.0.0.dist-info/RECORD +183 -0
- osscodeiq-0.0.0.dist-info/WHEEL +5 -0
- osscodeiq-0.0.0.dist-info/entry_points.txt +2 -0
- osscodeiq-0.0.0.dist-info/licenses/LICENSE +21 -0
- osscodeiq-0.0.0.dist-info/top_level.txt +1 -0
osscodeiq/__init__.py
ADDED
|
File without changes
|
osscodeiq/analyzer.py
ADDED
|
@@ -0,0 +1,467 @@
|
|
|
1
|
+
"""Pipeline orchestrator for OSSCodeIQ analysis."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from osscodeiq.cache.store import CacheStore
|
|
12
|
+
from osscodeiq.config import Config
|
|
13
|
+
from osscodeiq.detectors.base import DetectorContext, DetectorResult
|
|
14
|
+
from osscodeiq.detectors.registry import DetectorRegistry
|
|
15
|
+
from osscodeiq.discovery.change_detector import ChangeDetector
|
|
16
|
+
from osscodeiq.discovery.file_discovery import (
|
|
17
|
+
ChangeType,
|
|
18
|
+
DiscoveredFile,
|
|
19
|
+
FileDiscovery,
|
|
20
|
+
)
|
|
21
|
+
from osscodeiq.graph.builder import GraphBuilder
|
|
22
|
+
from osscodeiq.graph.store import GraphStore
|
|
23
|
+
from osscodeiq.models.graph import GraphEdge, GraphNode
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
# Languages handled by tree-sitter.
# Files in these languages get a full syntax tree via ParserManager; all
# other languages fall through to the structured/passthrough parsers below.
_TREESITTER_LANGUAGES: set[str] = {"java", "python", "typescript", "javascript"}

# Languages handled by structured parsers.
# Membership here routes a file through _parse_structured(); many entries
# (e.g. "go", "rust", "bash") are handed to detectors as raw text rather
# than parsed into a tree.
_STRUCTURED_LANGUAGES: set[str] = {
    "xml", "yaml", "json", "properties", "gradle", "sql",
    "bicep", "terraform", "csharp", "go", "cpp", "c",
    "bash", "powershell", "batch", "ruby", "rust", "kotlin",
    "scala", "swift", "r", "perl", "lua", "dart",
    "dockerfile", "toml", "ini", "dotenv", "csv",
    "vue", "svelte",
    "html", "css", "scss", "less", "razor", "cshtml", "asciidoc",
    "makefile", "gomod", "gosum", "groovy",
}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class AnalysisResult:
    """Result of running the full analysis pipeline."""

    # The populated graph store holding every detected node and edge.
    graph: GraphStore
    # Number of files actually parsed and run through detectors this run.
    files_analyzed: int
    # Number of files whose results were loaded from the incremental cache.
    files_cached: int
    # Total number of files discovered in the repository.
    total_files: int
    # Language identifier -> count of discovered files in that language.
    language_breakdown: dict[str, int]
    # Node kind (e.g. "class", "route") -> count of nodes in the final graph.
    node_breakdown: dict[str, int]
    # Count of files for which at least one detector is registered.
    files_with_detectors: int
    # Count of files with no registered detector (discovered but not analyzed).
    files_without_detectors: int
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _parse_toml(content: bytes, file_path: str) -> dict:
|
|
58
|
+
"""Parse TOML content."""
|
|
59
|
+
try:
|
|
60
|
+
import tomllib
|
|
61
|
+
except ModuleNotFoundError:
|
|
62
|
+
import tomli as tomllib # type: ignore[no-redef]
|
|
63
|
+
try:
|
|
64
|
+
text = content.decode("utf-8", errors="replace")
|
|
65
|
+
data = tomllib.loads(text)
|
|
66
|
+
except Exception as exc:
|
|
67
|
+
return {"error": "invalid_toml", "file": file_path, "detail": str(exc)}
|
|
68
|
+
return {"type": "toml", "file": file_path, "data": data}
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _parse_ini(content: bytes, file_path: str) -> dict:
|
|
72
|
+
"""Parse INI content."""
|
|
73
|
+
import configparser
|
|
74
|
+
try:
|
|
75
|
+
text = content.decode("utf-8", errors="replace")
|
|
76
|
+
parser = configparser.ConfigParser()
|
|
77
|
+
parser.read_string(text)
|
|
78
|
+
data = {section: dict(parser[section]) for section in parser.sections()}
|
|
79
|
+
except Exception as exc:
|
|
80
|
+
return {"error": "invalid_ini", "file": file_path, "detail": str(exc)}
|
|
81
|
+
return {"type": "ini", "file": file_path, "data": data}
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _text_passthrough(lang: str):
|
|
85
|
+
"""Return a parser that passes through raw text for regex-based detection."""
|
|
86
|
+
def _parse(content: bytes, file_path: str) -> dict:
|
|
87
|
+
return {"type": lang, "file": file_path, "data": content.decode("utf-8", errors="replace")}
|
|
88
|
+
return _parse
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _class_parser(module_path: str, class_name: str):
|
|
92
|
+
"""Return a parser that lazily imports and delegates to a structured parser class."""
|
|
93
|
+
def _parse(content: bytes, file_path: str):
|
|
94
|
+
mod = __import__(module_path, fromlist=[class_name])
|
|
95
|
+
cls = getattr(mod, class_name)
|
|
96
|
+
return cls().parse(content, file_path)
|
|
97
|
+
return _parse
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# Dispatch table for structured parsers. Keyed by language identifier.
# Class-backed parsers are imported lazily via _class_parser; TOML/INI use
# the local helpers; the remaining languages hand their raw text straight
# through to regex-based detectors.
_STRUCTURED_PARSERS: dict[str, Any] = {
    "xml": _class_parser("osscodeiq.parsing.structured.xml_parser", "XmlParser"),
    "yaml": _class_parser("osscodeiq.parsing.structured.yaml_parser", "YamlParser"),
    "json": _class_parser("osscodeiq.parsing.structured.json_parser", "JsonParser"),
    "properties": _class_parser("osscodeiq.parsing.structured.properties_parser", "PropertiesParser"),
    "gradle": _class_parser("osscodeiq.parsing.structured.gradle_parser", "GradleParser"),
    "sql": _class_parser("osscodeiq.parsing.structured.sql_parser", "SqlParser"),
    "toml": _parse_toml,
    "ini": _parse_ini,
}
# All plain-text languages share the same passthrough factory.
_STRUCTURED_PARSERS.update(
    (_lang, _text_passthrough(_lang))
    for _lang in (
        "markdown", "proto", "vue", "svelte",
        "html", "css", "scss", "less",
        "razor", "cshtml", "asciidoc",
        "makefile", "gomod", "gosum", "groovy",
    )
)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _parse_structured(language: str, content: bytes, file_path: str) -> Any:
    """Look up the parser registered for *language* and run it.

    Returns the parser's result, or ``None`` when no parser is registered
    or the parser raised (failures are logged at debug level, not raised).
    """
    handler = _STRUCTURED_PARSERS.get(language)
    if handler is None:
        return None
    try:
        return handler(content, file_path)
    except Exception:
        logger.debug("Structured parse failed for %s", file_path, exc_info=True)
        return None
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _analyze_file(
    file: DiscoveredFile,
    repo_path: Path,
    registry: DetectorRegistry,
    parser_manager: Any | None = None,
) -> tuple[DiscoveredFile, DetectorResult]:
    """Read, parse, and run every registered detector for one file.

    Designed to run on worker threads: tree-sitter releases the GIL while
    parsing, so a ThreadPoolExecutor gets real parallelism for that step.
    An unreadable file yields an empty DetectorResult instead of raising.
    """
    full_path = repo_path / file.path
    try:
        raw = full_path.read_bytes()
    except OSError:
        logger.warning("Could not read file %s", full_path)
        return file, DetectorResult()

    syntax_tree = None
    structured = None

    # Tree-sitter pass, only for languages with an available grammar.
    if file.language in _TREESITTER_LANGUAGES and parser_manager is not None:
        try:
            syntax_tree = parser_manager.parse_file(file, raw)
        except Exception:
            logger.debug("Tree-sitter parse failed for %s", file.path, exc_info=True)

    # Structured-format pass (YAML, TOML, SQL, raw-text languages, ...).
    if file.language in _STRUCTURED_LANGUAGES:
        try:
            structured = _parse_structured(file.language, raw, str(file.path))
        except Exception:
            logger.debug("Structured parse failed for %s", file.path, exc_info=True)

    context = DetectorContext(
        file_path=str(file.path),
        language=file.language,
        content=raw,
        tree=syntax_tree,
        parsed_data=structured,
        module_name=_derive_module_name(file.path, file.language),
    )

    # Each detector runs in isolation: one failing detector never aborts
    # the file, it is logged and the rest still contribute their results.
    combined = DetectorResult()
    for detector in registry.detectors_for_language(file.language):
        try:
            outcome = detector.detect(context)
            combined.nodes.extend(outcome.nodes)
            combined.edges.extend(outcome.edges)
        except Exception:
            logger.warning(
                "Detector %s failed on %s",
                detector.name,
                file.path,
                exc_info=True,
            )

    return file, combined
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _derive_module_name(path: Path, language: str) -> str | None:
|
|
204
|
+
"""Best-effort module name from file path."""
|
|
205
|
+
parts = path.parts
|
|
206
|
+
joined = "/".join(parts)
|
|
207
|
+
|
|
208
|
+
if language == "java":
|
|
209
|
+
for marker in ("src/main/java/", "src/test/java/"):
|
|
210
|
+
if marker in joined:
|
|
211
|
+
idx = joined.index(marker) + len(marker)
|
|
212
|
+
remainder = joined[idx:]
|
|
213
|
+
pkg = remainder.rsplit("/", 1)[0] if "/" in remainder else ""
|
|
214
|
+
return pkg.replace("/", ".") if pkg else None
|
|
215
|
+
return None
|
|
216
|
+
|
|
217
|
+
if language == "python":
|
|
218
|
+
parent = path.parent
|
|
219
|
+
if str(parent) == ".":
|
|
220
|
+
return None
|
|
221
|
+
return str(parent).replace("/", ".").replace("\\", ".")
|
|
222
|
+
|
|
223
|
+
# For XML/YAML/etc., use parent directory as module name
|
|
224
|
+
if language in _STRUCTURED_LANGUAGES:
|
|
225
|
+
parent = path.parent
|
|
226
|
+
if str(parent) == ".":
|
|
227
|
+
return None
|
|
228
|
+
return str(parent).replace("/", ".").replace("\\", ".")
|
|
229
|
+
|
|
230
|
+
return None
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
class Analyzer:
    """Orchestrates the full OSSCodeIQ analysis pipeline.

    Steps:
    1. Discover files (FileDiscovery)
    2. If incremental, detect changed files and load cached results for unchanged
    3. Parse and run detectors on changed/new files
    4. Aggregate results in GraphBuilder
    5. Run cross-file linkers
    6. Cache new results
    7. Return AnalysisResult
    """

    def __init__(self, config: Config | None = None) -> None:
        """Load detectors and (best-effort) the tree-sitter parser manager.

        *config* defaults to a fresh :class:`Config` when not provided.
        """
        self._config = config or Config()
        self._registry = DetectorRegistry()
        self._registry.load_builtin_detectors()
        self._registry.load_plugin_detectors()

        # Create ParserManager once (thread-safe via internal pool).
        # Imported lazily so a missing/broken tree-sitter install degrades
        # to structured/regex parsing instead of failing construction.
        self._parser_manager = None
        try:
            from osscodeiq.parsing.parser_manager import ParserManager
            self._parser_manager = ParserManager()
        except Exception:
            logger.warning("ParserManager unavailable, tree-sitter parsing disabled", exc_info=True)

    def run(
        self,
        repo_path: Path,
        incremental: bool = True,
        on_progress: Any | None = None,
    ) -> AnalysisResult:
        """Execute the analysis pipeline on *repo_path*.

        *incremental* enables the content-hash cache: unchanged files are
        loaded from cache instead of re-analyzed.

        *on_progress*, when provided, is called with a status string at
        each major pipeline milestone.
        """
        def _report(msg: str) -> None:
            # No-op when the caller did not ask for progress updates.
            if on_progress is not None:
                on_progress(msg)

        repo_path = repo_path.resolve()

        # ----------------------------------------------------------
        # 1. Discover files
        # ----------------------------------------------------------
        _report("🔍 Discovering files…")
        discovery = FileDiscovery(self._config)
        all_files = discovery.discover(repo_path)
        # May be None when the repo is not a git checkout — TODO confirm
        # against FileDiscovery; the `current_commit or ""` below suggests so.
        current_commit = discovery.current_commit
        total_files = len(all_files)

        # Compute language breakdown and detector coverage
        language_breakdown: dict[str, int] = {}
        files_with_detectors = 0
        files_without_detectors = 0
        for f in all_files:
            language_breakdown[f.language] = language_breakdown.get(f.language, 0) + 1
            if self._registry.detectors_for_language(f.language):
                files_with_detectors += 1
            else:
                files_without_detectors += 1

        _report(f"📁 Found {total_files} files")
        logger.info("Discovered %d files in %s", total_files, repo_path)

        # ----------------------------------------------------------
        # 2. Determine which files need (re-)analysis
        # ----------------------------------------------------------
        cache_cfg = self._config.cache
        cache: CacheStore | None = None
        # Default: analyze everything; narrowed below when the cache hits.
        files_to_analyze: list[DiscoveredFile] = all_files
        files_cached = 0

        from osscodeiq.graph.backends import create_backend
        # Ensure parent directory exists for file-based backends
        graph_path = self._config.graph.path
        if graph_path:
            Path(graph_path).parent.mkdir(parents=True, exist_ok=True)
        backend = create_backend(self._config.graph.backend, path=graph_path)
        builder = GraphBuilder(backend=backend)

        if cache_cfg.enabled:
            cache_path = repo_path / cache_cfg.directory / cache_cfg.db_name
            cache = CacheStore(cache_path)

        if incremental and cache is not None:
            last_commit = cache.get_last_commit()

            # Use ChangeDetector to find deleted files and purge stale cache
            # entries; only worth doing when the commit actually moved.
            if last_commit and current_commit and last_commit != current_commit:
                try:
                    change_detector = ChangeDetector()
                    changes = change_detector.detect_changes(repo_path, last_commit)
                    for changed in changes:
                        # Both deleted and modified files lose their cache
                        # entry; modified ones are re-analyzed via the
                        # content-hash miss below.
                        if changed.change_type == ChangeType.DELETED:
                            cache.remove_by_path(str(changed.path))
                        elif changed.change_type == ChangeType.MODIFIED:
                            cache.remove_by_path(str(changed.path))
                except Exception:
                    logger.debug("ChangeDetector failed, falling back to hash-based", exc_info=True)

            # Partition files into cached vs needs-analysis
            files_to_analyze = []
            for f in all_files:
                if cache.is_cached(f.content_hash):
                    nodes, edges = cache.load_cached_results(f.content_hash)
                    builder.add_nodes(nodes)
                    builder.add_edges(edges)
                    files_cached += 1
                else:
                    files_to_analyze.append(f)

            _report(f"💾 {files_cached} cached, {len(files_to_analyze)} to analyze")
            logger.info(
                "Incremental: %d cached, %d to analyze",
                files_cached,
                len(files_to_analyze),
            )

        files_analyzed = len(files_to_analyze)

        # ----------------------------------------------------------
        # 3 & 4. Parse and run detectors
        # ----------------------------------------------------------
        if files_to_analyze:
            _report(f"⚙️ Analyzing {files_analyzed} files…")
            parallelism = self._config.analysis.parallelism

            pm = self._parser_manager

            # Serial path avoids thread overhead for trivial workloads.
            if parallelism <= 1 or len(files_to_analyze) <= 1:
                results = [
                    _analyze_file(f, repo_path, self._registry, pm)
                    for f in files_to_analyze
                ]
            else:
                max_workers = min(parallelism, len(files_to_analyze))
                # Use a list aligned with files_to_analyze to preserve
                # deterministic ordering regardless of thread completion order.
                result_slots: list[tuple[DiscoveredFile, DetectorResult] | None] = [None] * len(files_to_analyze)
                with ThreadPoolExecutor(max_workers=max_workers) as executor:
                    futures = {
                        executor.submit(
                            _analyze_file, f, repo_path, self._registry, pm
                        ): idx
                        for idx, f in enumerate(files_to_analyze)
                    }
                    for future in as_completed(futures):
                        idx = futures[future]
                        try:
                            result_slots[idx] = future.result()
                        except Exception:
                            # A failed file leaves its slot as None and is
                            # simply dropped from the results below.
                            logger.warning(
                                "Analysis failed for %s",
                                files_to_analyze[idx].path,
                                exc_info=True,
                            )
                results = [r for r in result_slots if r is not None]

            # ----------------------------------------------------------
            # 5. Aggregate results into graph builder
            # ----------------------------------------------------------
            for file, detector_result in results:
                builder.merge_detector_result(detector_result)

                # Cache new results keyed by content hash; a cache write
                # failure is non-fatal (the analysis itself succeeded).
                if cache is not None:
                    try:
                        cache.store_results(
                            content_hash=file.content_hash,
                            file_path=str(file.path),
                            language=file.language,
                            nodes=detector_result.nodes,
                            edges=detector_result.edges,
                        )
                    except Exception:
                        logger.warning(
                            "Failed to cache results for %s",
                            file.path,
                            exc_info=True,
                        )

        # ----------------------------------------------------------
        # 6. Run cross-file linkers
        # ----------------------------------------------------------
        _report("🔗 Linking cross-file relationships…")
        builder.run_linkers()

        # ----------------------------------------------------------
        # 6b. Classify layers (after linkers so all nodes are covered)
        # ----------------------------------------------------------
        # NOTE(review): reaches into builder._store (private attribute);
        # consider a public accessor on GraphBuilder.
        from osscodeiq.classifiers.layer_classifier import LayerClassifier
        LayerClassifier().classify_store(builder._store)

        # ----------------------------------------------------------
        # 7. Record run and return result
        # ----------------------------------------------------------
        if cache is not None:
            try:
                cache.record_run(
                    commit_sha=current_commit or "",
                    file_count=total_files,
                )
            except Exception:
                logger.warning("Failed to record analysis run", exc_info=True)
            finally:
                # Always release the cache handle, even when record_run fails.
                cache.close()

        graph = builder.build()

        # Compute node breakdown
        node_breakdown: dict[str, int] = {}
        for node in graph.all_nodes():
            kind = node.kind.value
            node_breakdown[kind] = node_breakdown.get(kind, 0) + 1

        _report(f"✅ Analysis complete — {graph.node_count} nodes, {graph.edge_count} edges")
        logger.info(
            "Analysis complete: %d nodes, %d edges",
            graph.node_count,
            graph.edge_count,
        )

        return AnalysisResult(
            graph=graph,
            files_analyzed=files_analyzed,
            files_cached=files_cached,
            total_files=total_files,
            language_breakdown=language_breakdown,
            node_breakdown=node_breakdown,
            files_with_detectors=files_with_detectors,
            files_without_detectors=files_without_detectors,
        )
|
|
File without changes
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Content hashing utilities for cache invalidation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def hash_file_content(content: bytes) -> str:
    """Return the SHA-256 hex digest of *content*."""
    digest = hashlib.sha256()
    digest.update(content)
    return digest.hexdigest()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def hash_file(path: Path) -> str:
    """Read *path* and return its SHA-256 hex digest.

    Streams the file in 8 KiB chunks so large files are never held in
    memory in full.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as handle:
        while True:
            block = handle.read(8192)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
|