modwire-extraction 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. modwire_extraction/__init__.py +3 -0
  2. modwire_extraction/_version.py +6 -0
  3. modwire_extraction/code/__init__.py +20 -0
  4. modwire_extraction/code/code_map.py +30 -0
  5. modwire_extraction/code/query.py +223 -0
  6. modwire_extraction/dependency/__init__.py +6 -0
  7. modwire_extraction/dependency/graph.py +126 -0
  8. modwire_extraction/extraction.py +35 -0
  9. modwire_extraction/extractors/__init__.py +2 -0
  10. modwire_extraction/extractors/languages/__init__.py +7 -0
  11. modwire_extraction/extractors/languages/base.py +219 -0
  12. modwire_extraction/extractors/languages/loader.py +28 -0
  13. modwire_extraction/extractors/languages/php/build.php +50 -0
  14. modwire_extraction/extractors/languages/php/composer.json +14 -0
  15. modwire_extraction/extractors/languages/php/composer.lock +77 -0
  16. modwire_extraction/extractors/languages/php/script.php +0 -0
  17. modwire_extraction/extractors/languages/php/script.src.php +785 -0
  18. modwire_extraction/extractors/languages/php/source.py +26 -0
  19. modwire_extraction/extractors/languages/python/script.py +1000 -0
  20. modwire_extraction/extractors/languages/python/source.py +21 -0
  21. modwire_extraction/extractors/languages/typescript/build.mjs +29 -0
  22. modwire_extraction/extractors/languages/typescript/package-lock.json +351 -0
  23. modwire_extraction/extractors/languages/typescript/package.json +16 -0
  24. modwire_extraction/extractors/languages/typescript/script.js +234645 -0
  25. modwire_extraction/extractors/languages/typescript/script.ts +1110 -0
  26. modwire_extraction/extractors/languages/typescript/source.py +26 -0
  27. modwire_extraction/extractors/languages/typescript/tsconfig.json +14 -0
  28. modwire_extraction/extractors/source.py +200 -0
  29. modwire_extraction-1.0.0.dist-info/METADATA +98 -0
  30. modwire_extraction-1.0.0.dist-info/RECORD +33 -0
  31. modwire_extraction-1.0.0.dist-info/WHEEL +5 -0
  32. modwire_extraction-1.0.0.dist-info/licenses/LICENSE +21 -0
  33. modwire_extraction-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,3 @@
1
+ from .extraction import ModwireExtraction
2
+
3
+ __all__ = ["ModwireExtraction"]
@@ -0,0 +1,6 @@
1
+ from importlib.metadata import PackageNotFoundError, version
2
+
3
# Look up the version recorded in the installed distribution's metadata.
# When no distribution named "modwire-extraction" can be found, fall back
# to a fixed placeholder so importing this module never fails.
try:
    _resolved_version = version("modwire-extraction")
except PackageNotFoundError:
    _resolved_version = "0.0.0"

__version__ = _resolved_version
@@ -0,0 +1,20 @@
1
+ from .code_map import CodeMap
2
+ from .query import (
3
+ DependencyEdgeResult,
4
+ DependencyNodeResult,
5
+ QueryableCodeMap,
6
+ QueryBuilder,
7
+ SourceFileResult,
8
+ SourceItemResult,
9
+ )
10
+
11
+
12
+ __all__ = [
13
+ "CodeMap",
14
+ "DependencyEdgeResult",
15
+ "DependencyNodeResult",
16
+ "QueryableCodeMap",
17
+ "QueryBuilder",
18
+ "SourceFileResult",
19
+ "SourceItemResult",
20
+ ]
@@ -0,0 +1,30 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from pydantic import BaseModel, ConfigDict
6
+
7
+ from ..dependency.graph import DependencyGraph
8
+ from ..extractors.languages.base import SourceExtraction
9
+
10
+
11
class CodeMap(BaseModel):
    """Frozen bundle of one language's extraction and its dependency graph.

    The model is configured with ``frozen=True``, so its fields cannot be
    reassigned after construction. The ``to_*`` / ``from_*`` helpers are thin
    wrappers around pydantic's own dump/validate methods.
    """

    model_config = ConfigDict(frozen=True)

    # Name of the language this map was generated for.
    language: str
    # Extraction result (files plus discovery counters).
    extraction: SourceExtraction
    # Dependency graph that accompanies the extraction.
    dependency_graph: DependencyGraph

    # -- serialisation -----------------------------------------------------

    def to_dict(self) -> dict[str, Any]:
        """Dump the model to plain Python objects (``mode="python"``)."""
        dumped = self.model_dump(mode="python")
        return dumped

    def to_json(self) -> str:
        """Dump the model to a JSON string."""
        encoded = self.model_dump_json()
        return encoded

    # -- deserialisation ---------------------------------------------------

    @classmethod
    def from_dict(cls, payload: object) -> CodeMap:
        """Validate *payload* and return the resulting ``CodeMap``."""
        instance = cls.model_validate(payload)
        return instance

    @classmethod
    def from_json(cls, payload: str | bytes) -> CodeMap:
        """Validate a JSON document and return the resulting ``CodeMap``."""
        instance = cls.model_validate_json(payload)
        return instance
@@ -0,0 +1,223 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Callable, Iterable
4
+ from dataclasses import dataclass
5
+ from typing import Generic, TypeVar
6
+
7
+ from ..dependency.graph import Edge, Node
8
+ from ..extractors.source import (
9
+ SourceAbstractClass,
10
+ SourceCall,
11
+ SourceCallable,
12
+ SourceClass,
13
+ SourceExport,
14
+ SourceFile,
15
+ SourceFunction,
16
+ SourceImport,
17
+ SourceInterface,
18
+ SourceType,
19
+ SourceValue,
20
+ )
21
+ from .code_map import CodeMap
22
+
23
+
24
T = TypeVar("T")
SourceItem = TypeVar("SourceItem")


class QueryBuilder(Generic[T]):
    """Chainable filter over a fixed snapshot of items.

    The items are copied into a tuple on construction, and every ``where*``
    call returns a *new* builder carrying one more predicate, so an existing
    builder is never modified. Predicates are only evaluated by the terminal
    operations ``all``, ``first`` and ``count``; an item matches when every
    predicate returns a truthy value for it.
    """

    def __init__(
        self,
        items: Iterable[T],
        predicates: tuple[Callable[[T], bool], ...] = (),
    ):
        """Snapshot *items* and remember the *predicates* to apply later."""
        self._items = tuple(items)
        self._predicates = predicates

    def where(self, predicate: Callable[[T], bool]) -> QueryBuilder[T]:
        """Return a new builder that additionally requires *predicate*."""
        return QueryBuilder(self._items, (*self._predicates, predicate))

    def where_equal(
        self,
        selector: Callable[[T], object],
        expected: object,
    ) -> QueryBuilder[T]:
        """Keep items for which ``selector(item) == expected``."""
        return self.where(lambda item: selector(item) == expected)

    def where_contains(
        self,
        selector: Callable[[T], str],
        expected: str,
        *,
        case_sensitive: bool = True,
    ) -> QueryBuilder[T]:
        """Keep items whose selected string contains *expected*.

        With ``case_sensitive=False`` both sides are compared after
        ``str.casefold()``.
        """
        if case_sensitive:
            return self.where(lambda item: expected in selector(item))

        # Fold the needle once; only the haystack is folded per item.
        lowered = expected.casefold()
        return self.where(lambda item: lowered in selector(item).casefold())

    def _matching(self) -> Iterable[T]:
        """Lazily yield the items that satisfy every predicate, in order."""
        predicates = self._predicates
        return (
            item
            for item in self._items
            if all(predicate(item) for predicate in predicates)
        )

    def all(self) -> tuple[T, ...]:
        """Return every matching item, preserving the original order."""
        return tuple(self._matching())

    def first(self) -> T | None:
        """Return the first matching item, or ``None`` when nothing matches.

        Evaluation stops at the first match, so predicates are not run
        against the remaining items.
        """
        return next(iter(self._matching()), None)

    def count(self) -> int:
        """Return how many items match, without building a result tuple."""
        return sum(1 for _ in self._matching())
74
+
75
+
76
@dataclass(frozen=True)
class SourceFileResult:
    """Query result pairing an extracted file with the id it is stored under."""

    # Key of the file in the extraction's ``files`` mapping.
    source_id: str
    # Extracted data for that file.
    file: SourceFile
80
+
81
+
82
@dataclass(frozen=True)
class SourceItemResult(Generic[SourceItem]):
    """Query result for a single item found inside an extracted file.

    Generic over the item type so that, for example, an import query yields
    ``SourceItemResult[SourceImport]``.
    """

    # Key of the owning file in the extraction's ``files`` mapping.
    source_id: str
    # The file the item was read from.
    file: SourceFile
    # The item itself (import, class, function, ...).
    item: SourceItem
87
+
88
+
89
@dataclass(frozen=True)
class DependencyNodeResult:
    """Query result for one node of the dependency graph."""

    # Key of the node in the graph's ``nodes`` mapping.
    node_id: str
    # The graph node itself.
    node: Node
    # Extracted file stored under the same id, or ``None`` when there is none.
    file: SourceFile | None
94
+
95
+
96
@dataclass(frozen=True)
class DependencyEdgeResult:
    """Query result for one dependency edge plus the files at both ends."""

    # The graph edge itself.
    edge: Edge
    # Extracted file stored under ``edge.from_id``, or ``None`` if absent.
    source_file: SourceFile | None
    # Extracted file stored under ``edge.to_id``, or ``None`` if absent.
    target_file: SourceFile | None
101
+
102
+
103
class QueryableCodeMap:
    """Query facade over a ``CodeMap``.

    Collection accessors return a ``QueryBuilder`` whose items are small
    result wrappers pairing each value with the extracted file it belongs to,
    so callers can chain filters and still see where a match came from.
    """

    def __init__(self, code_map: CodeMap):
        # ``cm`` is a short alias bound to the same object as ``code_map``.
        self.code_map = self.cm = code_map

    def query(self, items: Iterable[T]) -> QueryBuilder[T]:
        """Wrap arbitrary *items* in a ``QueryBuilder`` with no predicates."""
        return QueryBuilder(items)

    # -- source files --------------------------------------------------------

    def source_ids(self) -> tuple[str, ...]:
        """Return the ids of all extracted files, in mapping order."""
        return tuple(self.code_map.extraction.files.keys())

    def has_source_file(self, source_id: str) -> bool:
        """Tell whether a file is stored under *source_id*."""
        extracted = self.code_map.extraction.files
        return source_id in extracted

    def source_file(self, source_id: str) -> SourceFileResult | None:
        """Return the file stored under *source_id*, or ``None`` if absent."""
        found = self.code_map.extraction.files.get(source_id)
        if found is not None:
            return SourceFileResult(source_id=source_id, file=found)
        return None

    def files(self) -> QueryBuilder[SourceFileResult]:
        """Alias for ``source_files``."""
        return self.source_files()

    def source_files(self) -> QueryBuilder[SourceFileResult]:
        """Return a builder over every extracted file."""
        wrapped = [
            SourceFileResult(source_id=key, file=value)
            for key, value in self.code_map.extraction.files.items()
        ]
        return QueryBuilder(wrapped)

    # -- per-file item collections ------------------------------------------

    def imports(self) -> QueryBuilder[SourceItemResult[SourceImport]]:
        """Return a builder over the ``imports`` of every file."""
        return self._source_items(lambda tracked: tracked.imports)

    def exports(self) -> QueryBuilder[SourceItemResult[SourceExport]]:
        """Return a builder over the ``exports`` of every file."""
        return self._source_items(lambda tracked: tracked.exports)

    def classes(self) -> QueryBuilder[SourceItemResult[SourceClass]]:
        """Return a builder over the ``classes`` of every file."""
        return self._source_items(lambda tracked: tracked.classes)

    def interfaces(self) -> QueryBuilder[SourceItemResult[SourceInterface]]:
        """Return a builder over the ``interfaces`` of every file."""
        return self._source_items(lambda tracked: tracked.interfaces)

    def types(self) -> QueryBuilder[SourceItemResult[SourceType]]:
        """Return a builder over the ``types`` of every file."""
        return self._source_items(lambda tracked: tracked.types)

    def abstract_classes(self) -> QueryBuilder[SourceItemResult[SourceAbstractClass]]:
        """Return a builder over the ``abstract_classes`` of every file."""
        return self._source_items(lambda tracked: tracked.abstract_classes)

    def functions(self) -> QueryBuilder[SourceItemResult[SourceFunction]]:
        """Return a builder over the ``functions`` of every file."""
        return self._source_items(lambda tracked: tracked.functions)

    def values(self) -> QueryBuilder[SourceItemResult[SourceValue]]:
        """Return a builder over the ``values`` of every file."""
        return self._source_items(lambda tracked: tracked.values)

    def callables(self) -> QueryBuilder[SourceItemResult[SourceCallable]]:
        """Return a builder over the ``callables`` of every file."""
        return self._source_items(lambda tracked: tracked.callables)

    def calls(self) -> QueryBuilder[SourceItemResult[SourceCall]]:
        """Return a builder over the ``calls`` of every file."""
        return self._source_items(lambda tracked: tracked.calls)

    # -- dependency graph ----------------------------------------------------

    def dependency_nodes(self) -> QueryBuilder[DependencyNodeResult]:
        """Return a builder over every graph node.

        Each result carries the extracted file stored under the node's id, or
        ``None`` when no file uses that id.
        """
        extracted = self.code_map.extraction.files
        graph_nodes = self.code_map.dependency_graph.nodes
        wrapped = [
            DependencyNodeResult(node_id=key, node=value, file=extracted.get(key))
            for key, value in graph_nodes.items()
        ]
        return QueryBuilder(wrapped)

    def dependency_edges(self) -> QueryBuilder[DependencyEdgeResult]:
        """Return a builder over every edge of the graph."""
        graph = self.code_map.dependency_graph
        return self._dependency_edges(graph.edges)

    def outgoing_dependencies(self, source_id: str) -> QueryBuilder[DependencyEdgeResult]:
        """Return a builder over the edges that start at *source_id*."""
        graph = self.code_map.dependency_graph
        return self._dependency_edges(graph.outgoing(source_id))

    def incoming_dependencies(self, source_id: str) -> QueryBuilder[DependencyEdgeResult]:
        """Return a builder over the edges that end at *source_id*."""
        graph = self.code_map.dependency_graph
        return self._dependency_edges(graph.incoming(source_id))

    def dependencies_between(
        self,
        source_id: str,
        target_id: str,
    ) -> QueryBuilder[DependencyEdgeResult]:
        """Return a builder over the edges from *source_id* to *target_id*."""
        graph = self.code_map.dependency_graph
        return self._dependency_edges(graph.edges_between(source_id, target_id))

    def tracked_dependency_edges(self) -> QueryBuilder[DependencyEdgeResult]:
        """Return a builder over edges whose both ends are extracted files."""
        graph = self.code_map.dependency_graph
        return self._dependency_edges(graph.tracked_edges(self.source_ids()))

    def external_dependency_edges(self) -> QueryBuilder[DependencyEdgeResult]:
        """Return a builder over edges with an end that is not an extracted file."""
        graph = self.code_map.dependency_graph
        return self._dependency_edges(graph.external_edges(self.source_ids()))

    # -- helpers -------------------------------------------------------------

    def _source_items(
        self,
        selector: Callable[[SourceFile], Iterable[SourceItem]],
    ) -> QueryBuilder[SourceItemResult[SourceItem]]:
        """Flatten ``selector(file)`` over all files into one builder."""
        wrapped = []
        for source_id, source_file in self.code_map.extraction.files.items():
            for item in selector(source_file):
                wrapped.append(
                    SourceItemResult(source_id=source_id, file=source_file, item=item)
                )
        return QueryBuilder(wrapped)

    def _dependency_edges(
        self,
        edges: Iterable[Edge],
    ) -> QueryBuilder[DependencyEdgeResult]:
        """Wrap *edges*, attaching the extracted file at each end when known."""
        extracted = self.code_map.extraction.files
        wrapped = []
        for edge in edges:
            wrapped.append(
                DependencyEdgeResult(
                    edge=edge,
                    source_file=extracted.get(edge.from_id),
                    target_file=extracted.get(edge.to_id),
                )
            )
        return QueryBuilder(wrapped)
@@ -0,0 +1,6 @@
1
+ from .graph import build_dependency_graph
2
+
3
+
4
+ __all__ = [
5
+ "build_dependency_graph",
6
+ ]
@@ -0,0 +1,126 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
6
+
7
+ from ..extractors.source import SourceFile
8
+
9
+
10
class Node(BaseModel):
    """Frozen graph node identified by ``id``."""

    model_config = ConfigDict(frozen=True)

    # Identifier of the node; also its key in ``DependencyGraph.nodes``.
    id: str
    # Free-form node category; defaults to "file".
    kind: str = "file"
15
+
16
+
17
class Edge(BaseModel):
    """Frozen directed edge from ``from_id`` to ``to_id``."""

    model_config = ConfigDict(frozen=True)

    # Id of the node the edge starts at.
    from_id: str
    # Id of the node the edge points to.
    to_id: str
    # Free-form edge category; defaults to "import".
    kind: str = "import"
23
+
24
+
25
class DependencyGraph(BaseModel):
    """Directed graph of nodes and edges with per-node edge indexes.

    ``nodes`` and ``edges`` are the model's fields. The two private indexes
    are derived from them in ``model_post_init`` and kept in step by
    ``add_node`` and ``add_edge``; changing ``nodes`` or ``edges`` directly
    does not update the indexes.
    """

    nodes: dict[str, Node] = Field(default_factory=dict)
    edges: list[Edge] = Field(default_factory=list)
    _outgoing_by_node: dict[str, list[Edge]] = PrivateAttr(default_factory=dict)
    _incoming_by_node: dict[str, list[Edge]] = PrivateAttr(default_factory=dict)

    def model_post_init(self, __context: Any) -> None:
        """Build the edge indexes once the fields have been populated."""
        self._rebuild_indexes()

    def _rebuild_indexes(self) -> None:
        """Recompute both indexes from ``nodes`` and ``edges``."""
        # Every known node gets an (initially empty) bucket in each index.
        outgoing: dict[str, list[Edge]] = {node_id: [] for node_id in self.nodes}
        incoming: dict[str, list[Edge]] = {node_id: [] for node_id in self.nodes}
        for edge in self.edges:
            # setdefault: an edge may mention an id that is not in ``nodes``.
            outgoing.setdefault(edge.from_id, []).append(edge)
            incoming.setdefault(edge.to_id, []).append(edge)
        self._outgoing_by_node = outgoing
        self._incoming_by_node = incoming

    def add_node(self, node_id: str, *, kind: str = "file") -> None:
        """Register *node_id*; an already-present node is left unchanged."""
        candidate = Node(id=node_id, kind=kind)
        self.nodes.setdefault(node_id, candidate)
        for index in (self._outgoing_by_node, self._incoming_by_node):
            index.setdefault(node_id, [])

    def add_edge(self, from_id: str, to_id: str, *, kind: str = "import") -> None:
        """Append an edge, registering both endpoints first.

        Endpoints that are not yet known are added with the default node kind.
        """
        for endpoint in (from_id, to_id):
            self.add_node(endpoint)
        created = Edge(from_id=from_id, to_id=to_id, kind=kind)
        self.edges.append(created)
        self._outgoing_by_node[from_id].append(created)
        self._incoming_by_node[to_id].append(created)

    def outgoing(self, node_id: str) -> tuple[Edge, ...]:
        """Return the edges starting at *node_id* (empty if unknown)."""
        bucket = self._outgoing_by_node.get(node_id, ())
        return tuple(bucket)

    def incoming(self, node_id: str) -> tuple[Edge, ...]:
        """Return the edges ending at *node_id* (empty if unknown)."""
        bucket = self._incoming_by_node.get(node_id, ())
        return tuple(bucket)

    def edges_between(self, source: str, target: str) -> tuple[Edge, ...]:
        """Return the edges going from *source* to *target*."""
        candidates = self._outgoing_by_node.get(source, ())
        return tuple([edge for edge in candidates if edge.to_id == target])

    def has_node(self, node_id: str) -> bool:
        """Tell whether *node_id* is a registered node."""
        return node_id in self.nodes

    def node_ids(self) -> tuple[str, ...]:
        """Return all node ids in insertion order."""
        return tuple(self.nodes)

    def sorted_nodes(self) -> tuple[Node, ...]:
        """Return all nodes ordered by id."""
        ordered_ids = sorted(self.nodes)
        return tuple(self.nodes[node_id] for node_id in ordered_ids)

    def sorted_edges(self) -> tuple[Edge, ...]:
        """Return all edges ordered by ``(from_id, to_id, kind)``."""
        ordered = list(self.edges)
        ordered.sort(key=lambda edge: (edge.from_id, edge.to_id, edge.kind))
        return tuple(ordered)

    def subgraph(self, node_ids: set[str] | tuple[str, ...]) -> DependencyGraph:
        """Return a new graph restricted to *node_ids*.

        Only ids that exist in this graph are copied, and only edges whose
        both endpoints are in *node_ids* are kept.
        """
        selected = set(node_ids)
        result = DependencyGraph()
        for node_id in sorted(selected):
            existing = self.nodes.get(node_id)
            if existing is None:
                continue
            result.add_node(node_id, kind=existing.kind)
        for edge in self.edges:
            if not (edge.from_id in selected and edge.to_id in selected):
                continue
            result.add_edge(edge.from_id, edge.to_id, kind=edge.kind)
        return result

    def without_external_nodes(
        self,
        tracked_ids: set[str] | tuple[str, ...],
    ) -> DependencyGraph:
        """Return the subgraph containing only *tracked_ids*."""
        return self.subgraph(set(tracked_ids))

    def tracked_edges(self, tracked_ids: set[str] | tuple[str, ...]) -> tuple[Edge, ...]:
        """Return the edges whose both endpoints are in *tracked_ids*."""
        tracked = set(tracked_ids)
        kept = [
            edge
            for edge in self.edges
            if edge.from_id in tracked and edge.to_id in tracked
        ]
        return tuple(kept)

    def external_edges(self, tracked_ids: set[str] | tuple[str, ...]) -> tuple[Edge, ...]:
        """Return the edges with at least one endpoint outside *tracked_ids*."""
        tracked = set(tracked_ids)
        kept = [
            edge
            for edge in self.edges
            if not (edge.from_id in tracked and edge.to_id in tracked)
        ]
        return tuple(kept)
116
+
117
+
118
def build_dependency_graph(extracted_files: dict[str, SourceFile]) -> DependencyGraph:
    """Build a dependency graph from extracted files.

    Each key of *extracted_files* becomes a node, and each of a file's
    imports becomes an edge from that file to the import's
    ``normalized_path``. Import targets that are not keys of
    *extracted_files* still become nodes, because adding an edge registers
    both endpoints.
    """
    result = DependencyGraph()

    for source_id, source_file in extracted_files.items():
        # Register the file even when it imports nothing.
        result.add_node(source_id)
        for reference in source_file.imports:
            target_id = reference.normalized_path
            result.add_edge(source_id, target_id)

    return result
@@ -0,0 +1,35 @@
1
+ from pathlib import Path
2
+
3
+ from .dependency import build_dependency_graph
4
+ from .extractors import languages
5
+ from .code import CodeMap, QueryableCodeMap
6
+
7
+
8
class ModwireExtraction:
    """Entry point for discovering languages and building code maps for a source root."""

    def __init__(self, root: "Path | str"):
        """Remember the resolved, absolute form of *root*.

        *root* may be a ``Path`` or a plain string; it is coerced with
        ``Path(...)`` before being resolved, so a string no longer fails with
        ``AttributeError``.
        """
        self._root = Path(root).resolve()

    def discover(self) -> tuple[str, ...]:
        """Return the supported languages that have source files under the root.

        Languages are reported in the order given by
        ``languages.get_supported_languages()``.
        """
        return tuple(
            language
            for language in languages.get_supported_languages()
            if languages.load_extractor(language).has_source_files(self._root)
        )

    def generate_map(self, language: str) -> CodeMap:
        """Extract *language* sources under the root and build their ``CodeMap``.

        Raises:
            ValueError: if *language* is not one of the supported languages.
        """
        available = languages.get_supported_languages()
        if language not in available:
            raise ValueError(f"Language is not supported: {language}")

        extraction = languages.load_extractor(language).extract_source(self._root)
        # The graph is derived purely from the files that were just extracted.
        dependency_graph = build_dependency_graph(extraction.files)
        return CodeMap(
            language=language,
            extraction=extraction,
            dependency_graph=dependency_graph,
        )

    def generate_queryable_map(self, language: str) -> QueryableCodeMap:
        """Build the ``CodeMap`` for *language* and wrap it for querying."""
        code_map = self.generate_map(language)
        return QueryableCodeMap(code_map=code_map)
@@ -0,0 +1,2 @@
1
+ from .languages import load_extractor
2
+
@@ -0,0 +1,7 @@
1
+ from .loader import load_extractor, get_supported_languages
2
+
3
+
4
+ __all__ = [
5
+ "load_extractor",
6
+ "get_supported_languages",
7
+ ]
@@ -0,0 +1,219 @@
1
+ import abc
2
+ import json
3
+ import os
4
+ import shutil
5
+ import subprocess
6
+ from dataclasses import dataclass
7
+ from typing import Any, Literal, cast
8
+
9
+ from pathlib import Path
10
+
11
+ from pydantic import BaseModel, ConfigDict
12
+
13
+ from ..source import SourceFile
14
+
15
+
16
class SourceExtraction(BaseModel):
    """Frozen result of extracting one language from a source root."""

    model_config = ConfigDict(frozen=True)

    # Extracted data keyed by source id.
    files: dict[str, SourceFile]
    # Number of source files that were discovered for extraction.
    files_found: int
    # Number of source files found under excluded directories.
    files_excluded: int

    def files_dict(self) -> dict[str, SourceFile]:
        """Return a shallow copy of ``files``."""
        return {source_id: source_file for source_id, source_file in self.files.items()}
25
+
26
+
27
@dataclass(frozen=True)
class BatchConfig:
    """Immutable batching settings for an extractor."""

    # Maximum number of source files handed to the extractor script per run.
    size: int = 500
    # NOTE(review): the three parallelism settings below are not read by the
    # base extractor visible here; their exact meaning should be confirmed
    # against the code that consumes them.
    parallel_threshold: int = 0
    parallel_size: int = 0
    max_workers: int = 1
    # How the script's stdout is encoded: one JSON object, or JSON lines.
    output_format: Literal["json", "jsonl"] = "json"
34
+
35
+
36
@dataclass(frozen=True)
class ExtractorRuntime:
    """Immutable description of how to run one language's extractor script."""

    # Language name; used in error messages.
    language: str
    # Lower-case file suffixes (with leading dot) treated as source files.
    file_extensions: tuple[str, ...]
    # Command prefix used to launch the script; the first entry is the executable.
    command: tuple[str, ...]
    # Location of the extractor script passed to the command.
    script_path: Path
42
+
43
+
44
class SourceExtractor(abc.ABC):
    """Base class for extractors that delegate parsing to an external script.

    Subclasses provide ``runtime`` (what to execute and which file suffixes
    count as source) and ``batch_config`` (batch size and output format). The
    base class discovers source files under a root, feeds them to the script
    in batches, and validates the script's output into ``SourceFile`` models.
    """

    # Directory names that are never descended into. ``_is_excluded_dir``
    # additionally excludes every name starting with ".", which already
    # covers the dotted entries listed here.
    excluded_dir_names = frozenset(
        {
            ".git",
            ".hg",
            ".mypy_cache",
            ".pytest_cache",
            ".ruff_cache",
            ".svn",
            ".venv",
            "__pycache__",
            "build",
            "coverage",
            "dist",
            "ignored",
            "node_modules",
            "vendor",
        }
    )

    @property
    @abc.abstractmethod
    def runtime(self) -> ExtractorRuntime:
        """Return how to launch the extractor and which suffixes it handles."""
        raise NotImplementedError

    @property
    @abc.abstractmethod
    def batch_config(self) -> BatchConfig:
        """Return the batching and output-format settings."""
        raise NotImplementedError

    def has_source_files(self, root: Path) -> bool:
        """Tell whether *root* contains at least one matching source file.

        The walk stops at the first match and never enters excluded
        directories, so it neither counts excluded files nor builds the full
        file list.

        Raises:
            ValueError: if *root* does not resolve to a directory.
        """
        resolved_root = root.resolve()
        if not resolved_root.is_dir():
            raise ValueError(f"Source root is not a directory: {root}")

        extensions = self.runtime.file_extensions
        for current_root, dir_names, file_names in os.walk(resolved_root):
            # Prune in place so os.walk skips excluded directories entirely.
            dir_names[:] = [
                dir_name
                for dir_name in dir_names
                if not self._is_excluded_dir(dir_name)
            ]
            if any(
                (Path(current_root) / file_name).suffix.lower() in extensions
                for file_name in file_names
            ):
                return True
        return False

    def extract_source(self, root: Path) -> SourceExtraction:
        """Extract every matching source file under *root*.

        Files are processed in batches of ``batch_config.size`` (at least 1)
        and the per-batch results are merged into one mapping.

        Raises:
            ValueError: if *root* does not resolve to a directory.
            RuntimeError: propagated from ``_extract_batch`` when the script
                or its runtime is missing, fails, or returns invalid output.
        """
        resolved_root = root.resolve()
        if not resolved_root.is_dir():
            raise ValueError(f"Source root is not a directory: {root}")

        source_paths, files_excluded = self._discover_source_files(resolved_root)
        files: dict[str, SourceFile] = {}
        # Guard against a zero or negative configured size.
        batch_size = max(1, self.batch_config.size)

        for start in range(0, len(source_paths), batch_size):
            batch_paths = source_paths[start : start + batch_size]
            files.update(self._extract_batch(resolved_root, batch_paths))

        return SourceExtraction(
            files=files,
            files_found=len(source_paths),
            files_excluded=files_excluded,
        )

    def _discover_source_files(self, root: Path) -> tuple[list[Path], int]:
        """Return the sorted matching files under *root* and the excluded count.

        The second element is the number of matching files that live under
        excluded directories, as reported by ``_count_source_files``.
        """
        source_paths: list[Path] = []
        files_excluded = 0
        extensions = self.runtime.file_extensions

        for current_root, dir_names, file_names in os.walk(root):
            current_path = Path(current_root)

            # Single pass: count what sits under excluded directories and keep
            # the rest for os.walk to descend into.
            kept_dirs: list[str] = []
            for dir_name in dir_names:
                if self._is_excluded_dir(dir_name):
                    files_excluded += self._count_source_files(current_path / dir_name)
                else:
                    kept_dirs.append(dir_name)
            dir_names[:] = kept_dirs

            for file_name in file_names:
                file_path = current_path / file_name
                if file_path.suffix.lower() in extensions:
                    source_paths.append(file_path.resolve())

        return sorted(source_paths), files_excluded

    def _count_source_files(self, root: Path) -> int:
        """Count matching files under *root*, skipping nested excluded directories."""
        count = 0
        extensions = self.runtime.file_extensions
        for current_root, dir_names, file_names in os.walk(root):
            dir_names[:] = [
                dir_name
                for dir_name in dir_names
                if not self._is_excluded_dir(dir_name)
            ]
            count += sum(
                1
                for file_name in file_names
                if (Path(current_root) / file_name).suffix.lower() in extensions
            )
        return count

    def _is_excluded_dir(self, name: str) -> bool:
        """Tell whether a directory called *name* must be skipped."""
        return name in self.excluded_dir_names or name.startswith(".")

    def _extract_batch(self, root: Path, source_paths: list[Path]) -> dict[str, SourceFile]:
        """Run the extractor script on *source_paths* and validate its output.

        The script is invoked as ``<command...> <script_path> --batch <root>``
        (plus ``--jsonl`` for the "jsonl" output format) and receives a JSON
        object mapping source id to file path on stdin.

        Raises:
            RuntimeError: if the script file is missing, the executable is not
                on PATH, the process exits non-zero, or the output has an
                invalid shape.
        """
        if not source_paths:
            return {}

        runtime = self.runtime
        if not runtime.script_path.is_file():
            raise RuntimeError(
                f"{runtime.language} extractor script is missing: {runtime.script_path}"
            )
        executable = runtime.command[0]
        if shutil.which(executable) is None:
            raise RuntimeError(
                f"{runtime.language} extractor runtime is not available on PATH: "
                f"{executable}"
            )

        paths_by_source_id = {
            self._source_id_for_path(root, source_path): str(source_path)
            for source_path in source_paths
        }
        command = [
            *runtime.command,
            str(runtime.script_path),
            "--batch",
            str(root),
        ]
        if self.batch_config.output_format == "jsonl":
            command.append("--jsonl")

        # check=False: the exit code is inspected below so the error message
        # can include the script's own stderr/stdout.
        result = subprocess.run(
            command,
            input=json.dumps(paths_by_source_id),
            text=True,
            capture_output=True,
            check=False,
        )
        if result.returncode != 0:
            message = result.stderr.strip() or result.stdout.strip()
            raise RuntimeError(
                f"{runtime.language} extractor failed with exit code "
                f"{result.returncode}: {message}"
            )

        extracted = self._parse_batch_output(result.stdout)
        return {
            source_id: SourceFile.model_validate(source_file)
            for source_id, source_file in extracted.items()
        }

    def _parse_batch_output(self, output: str) -> dict[str, Any]:
        """Decode the script's stdout into a mapping of source id to raw data.

        For "jsonl", every non-blank line must be a two-element JSON array
        ``[source_id, source_file]``; otherwise the whole output must be one
        JSON object.

        Raises:
            RuntimeError: if the decoded value has the wrong shape.
        """
        if self.batch_config.output_format == "jsonl":
            result: dict[str, Any] = {}
            for line in output.splitlines():
                if not line.strip():
                    continue
                item: Any = json.loads(line)
                if not isinstance(item, list):
                    raise RuntimeError("Extractor returned invalid JSONL batch output.")
                item_list = cast(list[Any], item)
                if len(item_list) != 2:
                    raise RuntimeError("Extractor returned invalid JSONL batch output.")
                source_id, source_file = item_list
                if not isinstance(source_id, str):
                    raise RuntimeError("Extractor returned a non-string source id.")
                result[source_id] = source_file
            return result

        parsed: Any = json.loads(output)
        if not isinstance(parsed, dict):
            raise RuntimeError("Extractor returned invalid JSON batch output.")
        return cast(dict[str, Any], parsed)

    def _source_id_for_path(self, root: Path, path: Path) -> str:
        """Return *path* relative to *root*, without its suffix, in POSIX form.

        NOTE(review): because the suffix is dropped, two files in the same
        directory that differ only by extension map to the same id.
        """
        relative_path = path.relative_to(root)
        return relative_path.with_suffix("").as_posix().strip("/")