modwire-extraction 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- modwire_extraction/__init__.py +3 -0
- modwire_extraction/_version.py +6 -0
- modwire_extraction/code/__init__.py +20 -0
- modwire_extraction/code/code_map.py +30 -0
- modwire_extraction/code/query.py +223 -0
- modwire_extraction/dependency/__init__.py +6 -0
- modwire_extraction/dependency/graph.py +126 -0
- modwire_extraction/extraction.py +35 -0
- modwire_extraction/extractors/__init__.py +2 -0
- modwire_extraction/extractors/languages/__init__.py +7 -0
- modwire_extraction/extractors/languages/base.py +219 -0
- modwire_extraction/extractors/languages/loader.py +28 -0
- modwire_extraction/extractors/languages/php/build.php +50 -0
- modwire_extraction/extractors/languages/php/composer.json +14 -0
- modwire_extraction/extractors/languages/php/composer.lock +77 -0
- modwire_extraction/extractors/languages/php/script.php +0 -0
- modwire_extraction/extractors/languages/php/script.src.php +785 -0
- modwire_extraction/extractors/languages/php/source.py +26 -0
- modwire_extraction/extractors/languages/python/script.py +1000 -0
- modwire_extraction/extractors/languages/python/source.py +21 -0
- modwire_extraction/extractors/languages/typescript/build.mjs +29 -0
- modwire_extraction/extractors/languages/typescript/package-lock.json +351 -0
- modwire_extraction/extractors/languages/typescript/package.json +16 -0
- modwire_extraction/extractors/languages/typescript/script.js +234645 -0
- modwire_extraction/extractors/languages/typescript/script.ts +1110 -0
- modwire_extraction/extractors/languages/typescript/source.py +26 -0
- modwire_extraction/extractors/languages/typescript/tsconfig.json +14 -0
- modwire_extraction/extractors/source.py +200 -0
- modwire_extraction-1.0.0.dist-info/METADATA +98 -0
- modwire_extraction-1.0.0.dist-info/RECORD +33 -0
- modwire_extraction-1.0.0.dist-info/WHEEL +5 -0
- modwire_extraction-1.0.0.dist-info/licenses/LICENSE +21 -0
- modwire_extraction-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from .code_map import CodeMap
|
|
2
|
+
from .query import (
|
|
3
|
+
DependencyEdgeResult,
|
|
4
|
+
DependencyNodeResult,
|
|
5
|
+
QueryableCodeMap,
|
|
6
|
+
QueryBuilder,
|
|
7
|
+
SourceFileResult,
|
|
8
|
+
SourceItemResult,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"CodeMap",
|
|
14
|
+
"DependencyEdgeResult",
|
|
15
|
+
"DependencyNodeResult",
|
|
16
|
+
"QueryableCodeMap",
|
|
17
|
+
"QueryBuilder",
|
|
18
|
+
"SourceFileResult",
|
|
19
|
+
"SourceItemResult",
|
|
20
|
+
]
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, ConfigDict
|
|
6
|
+
|
|
7
|
+
from ..dependency.graph import DependencyGraph
|
|
8
|
+
from ..extractors.languages.base import SourceExtraction
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class CodeMap(BaseModel):
    """Frozen bundle of one language's extraction and its dependency graph.

    Provides symmetric helpers for converting to and from plain Python data
    and JSON text.
    """

    model_config = ConfigDict(frozen=True)

    # Name of the language this map was generated for.
    language: str
    # Extraction result holding the per-file data.
    extraction: SourceExtraction
    # Dependency graph associated with the extraction.
    dependency_graph: DependencyGraph

    def to_dict(self) -> dict[str, Any]:
        """Dump the model using pydantic's ``"python"`` mode."""
        dumped = self.model_dump(mode="python")
        return dumped

    @classmethod
    def from_dict(cls, payload: object) -> CodeMap:
        """Validate ``payload`` and return the resulting ``CodeMap``."""
        instance = cls.model_validate(payload)
        return instance

    def to_json(self) -> str:
        """Serialise the model to a JSON string."""
        encoded = self.model_dump_json()
        return encoded

    @classmethod
    def from_json(cls, payload: str | bytes) -> CodeMap:
        """Validate JSON text (``str`` or ``bytes``) and return a ``CodeMap``."""
        instance = cls.model_validate_json(payload)
        return instance
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Callable, Iterable
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import Generic, TypeVar
|
|
6
|
+
|
|
7
|
+
from ..dependency.graph import Edge, Node
|
|
8
|
+
from ..extractors.source import (
|
|
9
|
+
SourceAbstractClass,
|
|
10
|
+
SourceCall,
|
|
11
|
+
SourceCallable,
|
|
12
|
+
SourceClass,
|
|
13
|
+
SourceExport,
|
|
14
|
+
SourceFile,
|
|
15
|
+
SourceFunction,
|
|
16
|
+
SourceImport,
|
|
17
|
+
SourceInterface,
|
|
18
|
+
SourceType,
|
|
19
|
+
SourceValue,
|
|
20
|
+
)
|
|
21
|
+
from .code_map import CodeMap
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
T = TypeVar("T")
SourceItem = TypeVar("SourceItem")


class QueryBuilder(Generic[T]):
    """Immutable, chainable filter over a fixed snapshot of items.

    The items are copied into a tuple on construction. Every ``where*`` call
    returns a new builder carrying one more predicate; the receiver is never
    modified. Predicates are only evaluated by the terminal methods
    (``all``, ``first``, ``count``), and an item is kept when every predicate
    returns a truthy value for it.
    """

    def __init__(
        self,
        items: Iterable[T],
        predicates: tuple[Callable[[T], bool], ...] = (),
    ):
        """Snapshot ``items`` and remember the predicates to apply.

        Args:
            items: Any iterable; it is consumed once and stored as a tuple.
            predicates: Filters that every returned item must satisfy.
        """
        self._items = tuple(items)
        # Copy into a tuple so later mutation of a caller-owned sequence
        # cannot change what this builder filters on.
        self._predicates = tuple(predicates)

    def where(self, predicate: Callable[[T], bool]) -> QueryBuilder[T]:
        """Return a new builder that additionally requires ``predicate``."""
        return QueryBuilder(self._items, (*self._predicates, predicate))

    def where_equal(
        self,
        selector: Callable[[T], object],
        expected: object,
    ) -> QueryBuilder[T]:
        """Keep items for which ``selector(item) == expected``."""
        return self.where(lambda item: selector(item) == expected)

    def where_contains(
        self,
        selector: Callable[[T], str],
        expected: str,
        *,
        case_sensitive: bool = True,
    ) -> QueryBuilder[T]:
        """Keep items whose selected string contains ``expected``.

        Args:
            selector: Extracts the string to search from an item.
            expected: Substring to look for.
            case_sensitive: When false, both sides are compared after
                ``str.casefold``.
        """
        if case_sensitive:
            return self.where(lambda item: expected in selector(item))

        # Fold the needle once instead of on every item.
        lowered = expected.casefold()
        return self.where(lambda item: lowered in selector(item).casefold())

    def _matching(self) -> Iterable[T]:
        """Lazily yield, in original order, the items passing every predicate."""
        predicates = self._predicates
        return (
            item
            for item in self._items
            if all(predicate(item) for predicate in predicates)
        )

    def all(self) -> tuple[T, ...]:
        """Return every matching item, in original order."""
        return tuple(self._matching())

    def first(self) -> T | None:
        """Return the first matching item, or ``None`` when nothing matches.

        Evaluation stops at the first match, so predicates are not run on the
        items that follow it.
        """
        return next(iter(self._matching()), None)

    def count(self) -> int:
        """Return the number of matching items without building a result tuple."""
        return sum(1 for _ in self._matching())
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass(frozen=True)
class SourceFileResult:
    """Immutable query result pairing a source file with its id."""

    # Key under which the file is stored in the extraction's file mapping.
    source_id: str
    # The extracted file record itself.
    file: SourceFile
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@dataclass(frozen=True)
class SourceItemResult(Generic[SourceItem]):
    """Immutable query result for one item found inside a source file.

    Generic over the item type so each accessor can expose a precise type.
    """

    # Id of the file the item was found in.
    source_id: str
    # The extracted file record that contains the item.
    file: SourceFile
    # The item itself (import, class, function, call, ...).
    item: SourceItem
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@dataclass(frozen=True)
class DependencyNodeResult:
    """Immutable query result for one node of the dependency graph."""

    # Key of the node in the graph's node mapping.
    node_id: str
    # The graph node record.
    node: Node
    # Extracted file with the same id, or None when there is no such file.
    file: SourceFile | None
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@dataclass(frozen=True)
class DependencyEdgeResult:
    """Immutable query result for one edge of the dependency graph."""

    # The graph edge record.
    edge: Edge
    # Extracted file for the edge's origin id, or None when not extracted.
    source_file: SourceFile | None
    # Extracted file for the edge's destination id, or None when not extracted.
    target_file: SourceFile | None
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class QueryableCodeMap:
    """Query facade over a ``CodeMap``.

    Collection accessors return a ``QueryBuilder`` so results can be narrowed
    with ``where``-style filters before being materialised.
    """

    def __init__(self, code_map: CodeMap):
        """Wrap ``code_map``; it is exposed as both ``code_map`` and ``cm``."""
        self.code_map = self.cm = code_map

    def _extracted_files(self):
        """Return the id -> file mapping of the wrapped extraction."""
        return self.code_map.extraction.files

    def query(self, items: Iterable[T]) -> QueryBuilder[T]:
        """Wrap an arbitrary iterable in a fresh ``QueryBuilder``."""
        builder = QueryBuilder(items)
        return builder

    def source_ids(self) -> tuple[str, ...]:
        """Return the ids of all extracted files, in mapping order."""
        return tuple(self._extracted_files())

    def has_source_file(self, source_id: str) -> bool:
        """Report whether ``source_id`` names an extracted file."""
        known = self._extracted_files()
        return source_id in known

    def source_file(self, source_id: str) -> SourceFileResult | None:
        """Look up one file by id; return ``None`` when it is not present."""
        found = self._extracted_files().get(source_id)
        if found is not None:
            return SourceFileResult(source_id=source_id, file=found)
        return None

    def files(self) -> QueryBuilder[SourceFileResult]:
        """Alias for ``source_files``."""
        return self.source_files()

    def source_files(self) -> QueryBuilder[SourceFileResult]:
        """Return a builder over every extracted file."""
        results = (
            SourceFileResult(source_id=key, file=entry)
            for key, entry in self._extracted_files().items()
        )
        return QueryBuilder(results)

    def imports(self) -> QueryBuilder[SourceItemResult[SourceImport]]:
        """Return a builder over the ``imports`` of every file."""
        return self._source_items(lambda entry: entry.imports)

    def exports(self) -> QueryBuilder[SourceItemResult[SourceExport]]:
        """Return a builder over the ``exports`` of every file."""
        return self._source_items(lambda entry: entry.exports)

    def classes(self) -> QueryBuilder[SourceItemResult[SourceClass]]:
        """Return a builder over the ``classes`` of every file."""
        return self._source_items(lambda entry: entry.classes)

    def interfaces(self) -> QueryBuilder[SourceItemResult[SourceInterface]]:
        """Return a builder over the ``interfaces`` of every file."""
        return self._source_items(lambda entry: entry.interfaces)

    def types(self) -> QueryBuilder[SourceItemResult[SourceType]]:
        """Return a builder over the ``types`` of every file."""
        return self._source_items(lambda entry: entry.types)

    def abstract_classes(self) -> QueryBuilder[SourceItemResult[SourceAbstractClass]]:
        """Return a builder over the ``abstract_classes`` of every file."""
        return self._source_items(lambda entry: entry.abstract_classes)

    def functions(self) -> QueryBuilder[SourceItemResult[SourceFunction]]:
        """Return a builder over the ``functions`` of every file."""
        return self._source_items(lambda entry: entry.functions)

    def values(self) -> QueryBuilder[SourceItemResult[SourceValue]]:
        """Return a builder over the ``values`` of every file."""
        return self._source_items(lambda entry: entry.values)

    def callables(self) -> QueryBuilder[SourceItemResult[SourceCallable]]:
        """Return a builder over the ``callables`` of every file."""
        return self._source_items(lambda entry: entry.callables)

    def calls(self) -> QueryBuilder[SourceItemResult[SourceCall]]:
        """Return a builder over the ``calls`` of every file."""
        return self._source_items(lambda entry: entry.calls)

    def dependency_nodes(self) -> QueryBuilder[DependencyNodeResult]:
        """Return a builder over every graph node, paired with its file if any."""
        by_id = self._extracted_files()
        graph_nodes = self.code_map.dependency_graph.nodes
        results = (
            DependencyNodeResult(node_id=key, node=graph_node, file=by_id.get(key))
            for key, graph_node in graph_nodes.items()
        )
        return QueryBuilder(results)

    def dependency_edges(self) -> QueryBuilder[DependencyEdgeResult]:
        """Return a builder over every edge of the graph."""
        graph = self.code_map.dependency_graph
        return self._dependency_edges(graph.edges)

    def outgoing_dependencies(self, source_id: str) -> QueryBuilder[DependencyEdgeResult]:
        """Return a builder over the edges the graph reports as leaving ``source_id``."""
        graph = self.code_map.dependency_graph
        return self._dependency_edges(graph.outgoing(source_id))

    def incoming_dependencies(self, source_id: str) -> QueryBuilder[DependencyEdgeResult]:
        """Return a builder over the edges the graph reports as entering ``source_id``."""
        graph = self.code_map.dependency_graph
        return self._dependency_edges(graph.incoming(source_id))

    def dependencies_between(
        self,
        source_id: str,
        target_id: str,
    ) -> QueryBuilder[DependencyEdgeResult]:
        """Return a builder over the graph's edges between the two given ids."""
        graph = self.code_map.dependency_graph
        between = graph.edges_between(source_id, target_id)
        return self._dependency_edges(between)

    def tracked_dependency_edges(self) -> QueryBuilder[DependencyEdgeResult]:
        """Return the graph's ``tracked_edges`` for the extracted file ids."""
        graph = self.code_map.dependency_graph
        tracked = graph.tracked_edges(self.source_ids())
        return self._dependency_edges(tracked)

    def external_dependency_edges(self) -> QueryBuilder[DependencyEdgeResult]:
        """Return the graph's ``external_edges`` for the extracted file ids."""
        graph = self.code_map.dependency_graph
        external = graph.external_edges(self.source_ids())
        return self._dependency_edges(external)

    def _source_items(
        self,
        selector: Callable[[SourceFile], Iterable[SourceItem]],
    ) -> QueryBuilder[SourceItemResult[SourceItem]]:
        """Flatten ``selector(file)`` across all files into one builder."""

        def generate():
            for key, entry in self._extracted_files().items():
                for member in selector(entry):
                    yield SourceItemResult(source_id=key, file=entry, item=member)

        return QueryBuilder(generate())

    def _dependency_edges(
        self,
        edges: Iterable[Edge],
    ) -> QueryBuilder[DependencyEdgeResult]:
        """Pair each edge with the extracted files at both of its ends."""
        by_id = self._extracted_files()

        def generate():
            for link in edges:
                yield DependencyEdgeResult(
                    edge=link,
                    source_file=by_id.get(link.from_id),
                    target_file=by_id.get(link.to_id),
                )

        return QueryBuilder(generate())
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
|
|
6
|
+
|
|
7
|
+
from ..extractors.source import SourceFile
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Node(BaseModel):
    """Frozen record describing one vertex of the dependency graph."""

    model_config = ConfigDict(frozen=True)

    # Identifier of the node; the graph stores nodes keyed by this value.
    id: str
    # Category label of the node; "file" unless another kind is given.
    kind: str = "file"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Edge(BaseModel):
    """Frozen record describing one directed link of the dependency graph."""

    model_config = ConfigDict(frozen=True)

    # Id of the node the edge starts at.
    from_id: str
    # Id of the node the edge points to.
    to_id: str
    # Category label of the edge; "import" unless another kind is given.
    kind: str = "import"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class DependencyGraph(BaseModel):
    """Directed graph of ``Node`` and ``Edge`` records with adjacency indexes.

    ``nodes`` and ``edges`` are the serialised fields. The two private
    per-node indexes are rebuilt after construction and updated by
    ``add_node`` / ``add_edge``.
    """

    nodes: dict[str, Node] = Field(default_factory=dict)
    edges: list[Edge] = Field(default_factory=list)
    # node id -> edges starting at / ending at that node.
    _outgoing_by_node: dict[str, list[Edge]] = PrivateAttr(default_factory=dict)
    _incoming_by_node: dict[str, list[Edge]] = PrivateAttr(default_factory=dict)

    def model_post_init(self, __context: Any) -> None:
        """Build the adjacency indexes once the fields are populated."""
        self._rebuild_indexes()

    def _rebuild_indexes(self) -> None:
        """Recompute both adjacency indexes from ``nodes`` and ``edges``."""
        outgoing: dict[str, list[Edge]] = {}
        incoming: dict[str, list[Edge]] = {}
        # Every known node gets an entry, even when it has no edges.
        for known_id in self.nodes:
            outgoing[known_id] = []
            incoming[known_id] = []
        # Edges may name ids missing from ``nodes``; index those as well.
        for link in self.edges:
            outgoing.setdefault(link.from_id, []).append(link)
            incoming.setdefault(link.to_id, []).append(link)
        self._outgoing_by_node = outgoing
        self._incoming_by_node = incoming

    def add_node(self, node_id: str, *, kind: str = "file") -> None:
        """Register ``node_id`` if it is new; an existing node is left as is."""
        candidate = Node(id=node_id, kind=kind)
        self.nodes.setdefault(node_id, candidate)
        for index in (self._outgoing_by_node, self._incoming_by_node):
            index.setdefault(node_id, [])

    def add_edge(self, from_id: str, to_id: str, *, kind: str = "import") -> None:
        """Append an edge, registering both endpoints as nodes first."""
        self.add_node(from_id)
        self.add_node(to_id)
        link = Edge(from_id=from_id, to_id=to_id, kind=kind)
        self.edges.append(link)
        self._outgoing_by_node[from_id].append(link)
        self._incoming_by_node[to_id].append(link)

    def outgoing(self, node_id: str) -> tuple[Edge, ...]:
        """Return the edges that start at ``node_id`` (empty if unknown)."""
        found = self._outgoing_by_node.get(node_id, ())
        return tuple(found)

    def incoming(self, node_id: str) -> tuple[Edge, ...]:
        """Return the edges that end at ``node_id`` (empty if unknown)."""
        found = self._incoming_by_node.get(node_id, ())
        return tuple(found)

    def edges_between(self, source: str, target: str) -> tuple[Edge, ...]:
        """Return the edges going from ``source`` to ``target``."""
        matches = []
        for link in self._outgoing_by_node.get(source, ()):
            if link.to_id == target:
                matches.append(link)
        return tuple(matches)

    def has_node(self, node_id: str) -> bool:
        """Report whether ``node_id`` is a registered node."""
        return node_id in self.nodes

    def node_ids(self) -> tuple[str, ...]:
        """Return all node ids in insertion order."""
        return tuple(self.nodes.keys())

    def sorted_nodes(self) -> tuple[Node, ...]:
        """Return all nodes ordered by id."""
        ordered_ids = sorted(self.nodes)
        return tuple(self.nodes[key] for key in ordered_ids)

    def sorted_edges(self) -> tuple[Edge, ...]:
        """Return all edges ordered by (from_id, to_id, kind)."""
        ordered = list(self.edges)
        ordered.sort(key=lambda link: (link.from_id, link.to_id, link.kind))
        return tuple(ordered)

    def subgraph(self, node_ids: set[str] | tuple[str, ...]) -> DependencyGraph:
        """Return a new graph restricted to ``node_ids``.

        Only ids that are nodes of this graph are copied, and only edges
        whose two endpoints are both in ``node_ids`` are kept.
        """
        wanted = set(node_ids)
        result = DependencyGraph()
        for key in sorted(wanted):
            if key not in self.nodes:
                continue
            result.add_node(key, kind=self.nodes[key].kind)
        for link in self.edges:
            if link.from_id in wanted and link.to_id in wanted:
                result.add_edge(link.from_id, link.to_id, kind=link.kind)
        return result

    def without_external_nodes(
        self,
        tracked_ids: set[str] | tuple[str, ...],
    ) -> DependencyGraph:
        """Return the subgraph induced by ``tracked_ids``."""
        return self.subgraph(set(tracked_ids))

    def tracked_edges(self, tracked_ids: set[str] | tuple[str, ...]) -> tuple[Edge, ...]:
        """Return the edges whose two endpoints are both in ``tracked_ids``."""
        inside = set(tracked_ids)
        return tuple(
            link
            for link in self.edges
            if link.from_id in inside and link.to_id in inside
        )

    def external_edges(self, tracked_ids: set[str] | tuple[str, ...]) -> tuple[Edge, ...]:
        """Return the edges with at least one endpoint outside ``tracked_ids``."""
        inside = set(tracked_ids)
        return tuple(
            link
            for link in self.edges
            if not (link.from_id in inside and link.to_id in inside)
        )
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def build_dependency_graph(extracted_files: dict[str, SourceFile]) -> DependencyGraph:
    """Build a dependency graph from extracted files.

    Each key of ``extracted_files`` becomes a node, and each entry of a
    file's ``imports`` becomes an edge from that file to the import's
    ``normalized_path``.
    """
    result = DependencyGraph()

    for source_id, source_file in extracted_files.items():
        # Register the file first so files without imports still appear.
        result.add_node(source_id)
        for reference in source_file.imports:
            target_id = reference.normalized_path
            result.add_edge(source_id, target_id)

    return result
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from .dependency import build_dependency_graph
|
|
4
|
+
from .extractors import languages
|
|
5
|
+
from .code import CodeMap, QueryableCodeMap
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ModwireExtraction:
    """Entry point that runs the language extractors against one source tree."""

    def __init__(self, root: Path):
        """Remember the resolved form of ``root`` for all later operations."""
        resolved = root.resolve()
        self._root = resolved

    def discover(self) -> tuple[str, ...]:
        """Return the supported languages that have source files under the root."""
        return tuple(
            language
            for language in languages.get_supported_languages()
            if languages.load_extractor(language).has_source_files(self._root)
        )

    def generate_map(self, language: str) -> CodeMap:
        """Extract ``language`` sources under the root and build a ``CodeMap``.

        Raises:
            ValueError: If ``language`` is not among the supported languages.
        """
        if language not in languages.get_supported_languages():
            raise ValueError(f"Language is not supported: {language}")

        extractor = languages.load_extractor(language)
        extraction = extractor.extract_source(self._root)
        graph = build_dependency_graph(extraction.files)
        return CodeMap(
            language=language,
            extraction=extraction,
            dependency_graph=graph,
        )

    def generate_queryable_map(self, language: str) -> QueryableCodeMap:
        """Like ``generate_map``, but wrap the result in a ``QueryableCodeMap``."""
        return QueryableCodeMap(code_map=self.generate_map(language))
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import shutil
|
|
5
|
+
import subprocess
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Any, Literal, cast
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, ConfigDict
|
|
12
|
+
|
|
13
|
+
from ..source import SourceFile
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class SourceExtraction(BaseModel):
    """Frozen result of one extraction run."""

    model_config = ConfigDict(frozen=True)

    # Extracted file records keyed by source id.
    files: dict[str, SourceFile]
    # Number of source files that were discovered.
    files_found: int
    # Number of source files that were counted as excluded.
    files_excluded: int

    def files_dict(self) -> dict[str, SourceFile]:
        """Return a shallow copy of ``files`` as a new dict."""
        return {**self.files}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True)
class BatchConfig:
    """Immutable batching settings for an extractor."""

    # Maximum number of files handed to the extractor in one batch.
    size: int = 500
    # NOTE(review): the next three fields are not read by the code visible
    # here; their exact semantics should be confirmed against their users.
    parallel_threshold: int = 0
    parallel_size: int = 0
    max_workers: int = 1
    # Output format requested from the extractor: one JSON document or JSONL.
    output_format: Literal["json", "jsonl"] = "json"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass(frozen=True)
class ExtractorRuntime:
    """Immutable description of how to run one language's extractor."""

    # Language name, used in error messages.
    language: str
    # File suffixes (compared against the lower-cased suffix) to extract.
    file_extensions: tuple[str, ...]
    # Leading command-line words; the first one is the executable.
    command: tuple[str, ...]
    # Extractor script passed to the command.
    script_path: Path
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class SourceExtractor(abc.ABC):
    """Base class for extractors that delegate parsing to an external script.

    Subclasses provide ``runtime`` (command, script and file extensions) and
    ``batch_config`` (batch size and output format). Discovery walks the
    source root, pruning excluded directories, and extraction feeds the
    discovered files to the external script in batches.
    """

    # Directory names skipped during discovery. Any directory whose name
    # starts with "." is skipped too (see ``_is_excluded_dir``).
    excluded_dir_names = frozenset(
        {
            ".git",
            ".hg",
            ".mypy_cache",
            ".pytest_cache",
            ".ruff_cache",
            ".svn",
            ".venv",
            "__pycache__",
            "build",
            "coverage",
            "dist",
            "ignored",
            "node_modules",
            "vendor",
        }
    )

    @property
    @abc.abstractmethod
    def runtime(self) -> ExtractorRuntime:
        """Describe the command, script and extensions for this language."""
        raise NotImplementedError

    @property
    @abc.abstractmethod
    def batch_config(self) -> BatchConfig:
        """Return the batching settings for this language."""
        raise NotImplementedError

    def has_source_files(self, root: Path) -> bool:
        """Report whether any matching source file exists under ``root``.

        Raises:
            ValueError: If ``root`` does not resolve to a directory.
        """
        resolved_root = self._resolve_source_root(root)
        source_paths, _ = self._discover_source_files(resolved_root)
        return bool(source_paths)

    def extract_source(self, root: Path) -> SourceExtraction:
        """Discover and extract every matching source file under ``root``.

        Files are sent to the external extractor in batches of at most
        ``batch_config.size`` (never fewer than one per batch).

        Raises:
            ValueError: If ``root`` does not resolve to a directory.
            RuntimeError: If the extractor script or runtime is missing, the
                extractor exits non-zero, or its output has the wrong shape.
        """
        resolved_root = self._resolve_source_root(root)
        source_paths, files_excluded = self._discover_source_files(resolved_root)
        files: dict[str, SourceFile] = {}
        # Guard against a zero or negative configured size.
        batch_size = max(1, self.batch_config.size)

        for start in range(0, len(source_paths), batch_size):
            batch_paths = source_paths[start : start + batch_size]
            files.update(self._extract_batch(resolved_root, batch_paths))

        return SourceExtraction(
            files=files,
            files_found=len(source_paths),
            files_excluded=files_excluded,
        )

    def _resolve_source_root(self, root: Path) -> Path:
        """Resolve ``root`` and ensure it is a directory."""
        resolved_root = root.resolve()
        if not resolved_root.is_dir():
            raise ValueError(f"Source root is not a directory: {root}")
        return resolved_root

    def _discover_source_files(self, root: Path) -> tuple[list[Path], int]:
        """Walk ``root`` and collect matching files outside excluded directories.

        Returns:
            The sorted matching paths, and the number of matching files found
            inside the excluded directories that were pruned.
        """
        source_paths: list[Path] = []
        files_excluded = 0
        extensions = self.runtime.file_extensions

        for current_root, dir_names, file_names in os.walk(root):
            current_path = Path(current_root)
            kept_dirs: list[str] = []
            excluded_dirs: list[str] = []
            for dir_name in dir_names:
                bucket = excluded_dirs if self._is_excluded_dir(dir_name) else kept_dirs
                bucket.append(dir_name)

            files_excluded += sum(
                self._count_source_files(current_path / dir_name)
                for dir_name in excluded_dirs
            )
            # Assigning in place is what makes os.walk skip the pruned dirs.
            dir_names[:] = kept_dirs

            for file_name in file_names:
                file_path = current_path / file_name
                if file_path.suffix.lower() in extensions:
                    # Keep the path as found under ``root``. Resolving it would
                    # follow a symlinked file to its target, which may lie
                    # outside ``root`` and then cannot be turned into a
                    # root-relative source id.
                    source_paths.append(file_path)

        return sorted(source_paths), files_excluded

    def _count_source_files(self, root: Path) -> int:
        """Count matching files under ``root``, skipping nested excluded dirs."""
        count = 0
        extensions = self.runtime.file_extensions
        for current_root, dir_names, file_names in os.walk(root):
            dir_names[:] = [
                dir_name
                for dir_name in dir_names
                if not self._is_excluded_dir(dir_name)
            ]
            count += sum(
                1
                for file_name in file_names
                if (Path(current_root) / file_name).suffix.lower() in extensions
            )
        return count

    def _is_excluded_dir(self, name: str) -> bool:
        """Report whether a directory name is excluded from discovery."""
        return name in self.excluded_dir_names or name.startswith(".")

    def _extract_batch(self, root: Path, source_paths: list[Path]) -> dict[str, SourceFile]:
        """Run the external extractor on one batch of files.

        The script receives a JSON object mapping source id to file path on
        stdin and must print its result on stdout.

        Raises:
            RuntimeError: If the script file is missing, the executable is
                not on PATH, or the process exits with a non-zero code.
        """
        if not source_paths:
            return {}

        runtime = self.runtime
        if not runtime.script_path.is_file():
            raise RuntimeError(
                f"{runtime.language} extractor script is missing: {runtime.script_path}"
            )
        executable = runtime.command[0]
        if shutil.which(executable) is None:
            raise RuntimeError(
                f"{runtime.language} extractor runtime is not available on PATH: "
                f"{executable}"
            )

        # NOTE(review): ids drop the file extension, so two files differing
        # only by extension share one id and the later one wins here.
        paths_by_source_id = {
            self._source_id_for_path(root, source_path): str(source_path)
            for source_path in source_paths
        }
        command = [
            *runtime.command,
            str(runtime.script_path),
            "--batch",
            str(root),
        ]
        if self.batch_config.output_format == "jsonl":
            command.append("--jsonl")

        result = subprocess.run(
            command,
            input=json.dumps(paths_by_source_id),
            text=True,
            capture_output=True,
            check=False,
        )
        if result.returncode != 0:
            # Prefer stderr; fall back to stdout when stderr is empty.
            message = result.stderr.strip() or result.stdout.strip()
            raise RuntimeError(
                f"{runtime.language} extractor failed with exit code "
                f"{result.returncode}: {message}"
            )

        extracted = self._parse_batch_output(result.stdout)
        return {
            source_id: SourceFile.model_validate(source_file)
            for source_id, source_file in extracted.items()
        }

    def _parse_batch_output(self, output: str) -> dict[str, Any]:
        """Decode extractor stdout into a mapping of source id to raw data.

        In ``"jsonl"`` mode every non-blank line must be a two-element JSON
        array ``[source_id, source_file]``; otherwise the whole output must
        be one JSON object.

        Raises:
            RuntimeError: If the decoded output has the wrong shape.
        """
        if self.batch_config.output_format == "jsonl":
            result: dict[str, Any] = {}
            for line in output.splitlines():
                if not line.strip():
                    continue
                item: Any = json.loads(line)
                if not isinstance(item, list):
                    raise RuntimeError("Extractor returned invalid JSONL batch output.")
                item_list = cast(list[Any], item)
                if len(item_list) != 2:
                    raise RuntimeError("Extractor returned invalid JSONL batch output.")
                source_id, source_file = item_list
                if not isinstance(source_id, str):
                    raise RuntimeError("Extractor returned a non-string source id.")
                result[source_id] = source_file
            return result

        parsed: Any = json.loads(output)
        if not isinstance(parsed, dict):
            raise RuntimeError("Extractor returned invalid JSON batch output.")
        return cast(dict[str, Any], parsed)

    def _source_id_for_path(self, root: Path, path: Path) -> str:
        """Return the root-relative POSIX path of ``path`` without its suffix."""
        relative_path = path.relative_to(root)
        return relative_path.with_suffix("").as_posix().strip("/")
|