component-mapper 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,214 @@
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ import time
5
+ from pathlib import Path
6
+ import aiohttp
7
+ import aiofiles
8
+ from component_mapper.config import RegistryConfig
9
+ from component_mapper.models import RegistrySource
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class RegistryFetcher:
    """Fetch component registry JSON over HTTP with memory and disk caching.

    Every lookup walks three tiers in order:

    1. an in-process dict (``_memory_cache``),
    2. JSON files under ``.cache/registry_http`` (relative to the current
       working directory),
    3. an HTTP GET, bounded by a semaphore with
       ``config.max_concurrent_fetches`` slots.

    A failed fetch never raises to the caller; it resolves to the empty
    placeholder ``{"name": <name>, "files": []}``. Placeholders are kept in
    the memory cache only, so a transient outage is not persisted to disk
    and served as an "empty component" by later runs.
    """

    def __init__(self, config: RegistryConfig):
        """Store the config and create the on-disk cache directory."""
        self.config = config
        self._memory_cache: dict[str, dict] = {}
        self._cache_timestamps: dict[str, float] = {}
        self._semaphore = asyncio.Semaphore(config.max_concurrent_fetches)
        self._disk_cache_dir = Path(".cache/registry_http")
        self._disk_cache_dir.mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------
    # Cache helpers
    # ------------------------------------------------------------------

    def _cache_key(self, name: str, source: RegistrySource) -> str:
        """Return the memory-cache key for ``name`` in ``source``."""
        return f"{source.value}:{name}"

    @staticmethod
    def _safe_file_part(value: str) -> str:
        """Return ``value`` with characters unsafe in a file name replaced.

        Anything other than alphanumerics, ``.``, ``_`` and ``-`` becomes
        ``_`` so that a name containing a path separator cannot point the
        cache file outside (or into a missing subdirectory of) the cache dir.
        """
        return "".join(
            ch if ch.isalnum() or ch in "._-" else "_" for ch in str(value)
        )

    def _disk_cache_path(self, name: str, source: RegistrySource) -> Path:
        """Return the disk-cache file for ``name`` in ``source``."""
        file_name = f"{source.value}_{self._safe_file_part(name)}.json"
        return self._disk_cache_dir / file_name

    def _ttl_seconds(self) -> float:
        """Return the cache time-to-live in seconds."""
        return self.config.http_cache_ttl_hours * 3600

    def _is_cache_fresh(self, cache_key: str) -> bool:
        """Return True if the memory entry for ``cache_key`` is within TTL."""
        ts = self._cache_timestamps.get(cache_key, 0)
        return (time.time() - ts) < self._ttl_seconds()

    @staticmethod
    def _placeholder(name: str) -> dict:
        """Return the empty registry entry used when a fetch fails."""
        return {"name": name, "files": []}

    async def _load_from_disk(self, cache_key: str, disk_path: Path) -> dict | None:
        """Return the fresh disk-cached entry at ``disk_path``, else None.

        On a hit the entry is also promoted into the memory cache.
        """
        try:
            mtime = disk_path.stat().st_mtime
        except OSError:
            return None  # no cache file (or unreadable): plain cache miss
        if (time.time() - mtime) >= self._ttl_seconds():
            return None

        try:
            async with aiofiles.open(disk_path, "r", encoding="utf-8") as f:
                data = json.loads(await f.read())
        except Exception as exc:
            # Best-effort cache: a corrupt or unreadable file falls through
            # to the HTTP fetch, but is no longer completely silent.
            logger.debug("Ignoring unreadable registry cache %s: %s", disk_path, exc)
            return None
        if not isinstance(data, dict):
            logger.debug("Ignoring non-object registry cache %s", disk_path)
            return None

        self._memory_cache[cache_key] = data
        # Stamp with the file's mtime, not "now": otherwise an entry loaded
        # just before expiry would live for a second full TTL in memory.
        self._cache_timestamps[cache_key] = mtime
        return data

    async def _remember(
        self,
        cache_key: str,
        disk_path: Path,
        data: dict,
        *,
        persist: bool,
    ) -> None:
        """Store ``data`` in memory and, when ``persist`` is set, on disk."""
        self._memory_cache[cache_key] = data
        self._cache_timestamps[cache_key] = time.time()
        if not persist:
            return
        try:
            async with aiofiles.open(disk_path, "w", encoding="utf-8") as f:
                await f.write(json.dumps(data))
        except Exception as exc:
            # Persisting is an optimisation only; never fail the fetch for it.
            logger.debug("Could not write registry cache %s: %s", disk_path, exc)

    # ------------------------------------------------------------------
    # HTTP
    # ------------------------------------------------------------------

    def _base_url(self, source: RegistrySource) -> str:
        """Return the registry base URL for ``source``.

        Non-shadcn sources use ``custom_registry_base_url`` and fall back to
        the shadcn URL when no custom URL is configured.
        """
        if source == RegistrySource.SHADCN:
            return self.config.shadcn_registry_base_url
        return (
            self.config.custom_registry_base_url
            or self.config.shadcn_registry_base_url
        )

    async def _get_json(self, url: str) -> tuple[int, dict | None]:
        """GET ``url`` under the concurrency semaphore.

        Returns ``(status, payload)``; ``payload`` is None for any non-200
        status. Raises whatever aiohttp raises (timeouts, connection and
        decode errors) and ``ValueError`` when the body is valid JSON but not
        a JSON object, since callers rely on ``dict`` semantics.
        """
        timeout = aiohttp.ClientTimeout(total=self.config.fetch_timeout_seconds)
        async with self._semaphore:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, timeout=timeout) as resp:
                    if resp.status != 200:
                        return resp.status, None
                    # content_type=None: skip aiohttp's Content-Type check.
                    payload = await resp.json(content_type=None)
        if not isinstance(payload, dict):
            raise ValueError(
                f"expected a JSON object, got {type(payload).__name__}"
            )
        return 200, payload

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def fetch_component(
        self,
        name: str,
        source: RegistrySource = RegistrySource.SHADCN,
    ) -> dict:
        """Return the registry JSON for component ``name``.

        Never raises for network or HTTP errors; those are logged and the
        placeholder ``{"name": name, "files": []}`` is returned instead.
        """
        cache_key = self._cache_key(name, source)

        # Memory cache
        if cache_key in self._memory_cache and self._is_cache_fresh(cache_key):
            return self._memory_cache[cache_key]

        # Disk cache
        disk_path = self._disk_cache_path(name, source)
        cached = await self._load_from_disk(cache_key, disk_path)
        if cached is not None:
            return cached

        # HTTP fetch
        url = f"{self._base_url(source)}/{name}.json"
        try:
            status, data = await self._get_json(url)
        except Exception as exc:
            logger.warning("Registry fetch failed for %s: %s", name, exc)
            data = None
        else:
            if data is None:
                logger.warning("Registry fetch %s returned %d", url, status)

        fetched = data is not None
        if data is None:
            data = self._placeholder(name)
        await self._remember(cache_key, disk_path, data, persist=fetched)
        return data

    async def fetch_many(
        self,
        names: list[str],
        source: RegistrySource = RegistrySource.SHADCN,
    ) -> dict[str, dict]:
        """Concurrent fetch under semaphore. Returns name -> registry JSON."""
        results = await asyncio.gather(
            *(self.fetch_component(n, source) for n in names),
            return_exceptions=True,
        )
        return {
            name: (r if isinstance(r, dict) else self._placeholder(name))
            for name, r in zip(names, results)
        }

    async def fetch_source_code(self, name: str) -> str:
        """Extract TypeScript source from registry JSON files array.

        Returns the ``content`` of the first entry in ``files``, or ``""``
        when the entry is missing, malformed, or has no content.
        """
        data = await self.fetch_component(name, RegistrySource.SHADCN)
        files = data.get("files")
        if not isinstance(files, list) or not files:
            return ""
        first = files[0]
        if not isinstance(first, dict):
            return ""
        # ``or ""`` also covers an explicit JSON null for "content".
        return first.get("content") or ""

    async def fetch_from_external(
        self,
        url_template: str,
        component_name: str,
        registry_name: str,
    ) -> dict:
        """Fetch a component from an external registry using its URL template.

        url_template uses {name} as the placeholder e.g.
        "https://bundui.io/r/{name}.json" → "https://bundui.io/r/pagination.json"

        Never raises for network or HTTP errors; those are logged and the
        placeholder ``{"name": component_name, "files": []}`` is returned.
        """
        cache_key = f"external:{registry_name}:{component_name}"

        if cache_key in self._memory_cache and self._is_cache_fresh(cache_key):
            return self._memory_cache[cache_key]

        disk_path = self._disk_cache_dir / (
            f"ext_{self._safe_file_part(registry_name)}"
            f"_{self._safe_file_part(component_name)}.json"
        )
        cached = await self._load_from_disk(cache_key, disk_path)
        if cached is not None:
            return cached

        url = url_template.replace("{name}", component_name)
        try:
            status, data = await self._get_json(url)
        except Exception as exc:
            logger.warning(
                "External registry fetch failed %s/%s: %s",
                registry_name, component_name, exc,
            )
            data = None
        else:
            if data is None:
                logger.warning(
                    "External registry %s returned %d for %s",
                    registry_name, status, component_name,
                )
            else:
                logger.debug(
                    "Fetched %s/%s from external registry",
                    registry_name, component_name,
                )

        fetched = data is not None
        if data is None:
            data = self._placeholder(component_name)
        await self._remember(cache_key, disk_path, data, persist=fetched)
        return data

    async def fetch_all_external(
        self,
        external_registries: list,
    ) -> dict[str, dict]:
        """Fetch all components from all external registries concurrently.

        Registries whose ``open_source`` attribute is falsy are skipped.

        Returns dict keyed by "registry_name/component_name".
        """
        # Collect call arguments first and create the coroutines afterwards:
        # building coroutines directly into the dict would leave one
        # un-awaited whenever the same key appears twice.
        requests: dict[str, tuple[str, str, str]] = {}
        for reg in external_registries:
            if not reg.open_source:
                continue
            for comp in reg.components:
                requests[f"{reg.name}/{comp}"] = (reg.url_template, comp, reg.name)

        if not requests:
            return {}

        results = await asyncio.gather(
            *(self.fetch_from_external(*args) for args in requests.values()),
            return_exceptions=True,
        )
        return {
            key: (r if isinstance(r, dict) else {"name": key, "files": []})
            for key, r in zip(requests, results)
        }
@@ -0,0 +1,159 @@
1
+ from enum import Enum
2
+ from typing import Any
3
+ from pydantic import BaseModel, Field
4
+ from segment_classifier.models import ClassifiedSegment, ComponentType
5
+
6
+
7
class MappingStage(str, Enum):
    """Outcome label recording which pipeline stage resolved a segment.

    Members are also plain strings, so they compare equal to their values.
    """

    # Resolved by the mapping-cache lookup stage.
    CACHE_HIT = "cache_hit"
    # Resolved by the structural-match stage.
    STRUCTURAL_MATCH = "structural_match"
    # Resolved by the LLM mapping stage.
    LLM_MAPPED = "llm_mapped"
    # NOTE(review): presumably an LLM-proposed component with no registry
    # counterpart -- confirm against the LLM stage.
    LLM_NOVEL = "llm_novel"
    # No stage produced a mapping.
    UNRESOLVED = "unresolved"
13
+
14
+
15
class InteractivityMode(str, Enum):
    """How interactive a component is; members double as plain strings."""

    STATIC = "static"
    INTERACTIVE = "interactive"
    # NOTE(review): presumably a mix of static and interactive parts --
    # confirm with the code that assigns this value.
    PARTIAL = "partial"
19
+
20
+
21
class RegistrySource(str, Enum):
    """Registry a component originates from; members double as plain strings."""

    SHADCN = "shadcn"
    CUSTOM = "custom"
    NOVEL = "novel"
25
+
26
+
27
class PropDefinition(BaseModel):
    """Declaration of a single component prop.

    Only ``name`` and ``type`` are required; the remaining fields default to
    "optional prop, no default value, no description".
    """

    name: str
    type: str
    required: bool = False
    default_value: str | None = None
    description: str = ""
33
+
34
+
35
class ComponentSignature(BaseModel):
    """Full description of one registry component.

    Groups identity, structural features, type compatibility, props and
    Astro integration details into a single record.
    """

    # --- Identity ---------------------------------------------------------
    component_name: str
    registry_source: RegistrySource

    # --- Structural features ----------------------------------------------
    # NOTE(review): presumably the inputs to structural matching -- confirm
    # against the structural-match stage.
    dom_skeleton: str
    root_element: str
    required_children: list[str]
    optional_children: list[str]
    structural_class_tokens: list[str]
    typical_nesting_depth: int
    child_tag_counts: dict[str, int]
    unique_tag_count: int

    # --- Classification ---------------------------------------------------
    compatible_component_types: list[ComponentType]
    interactivity: InteractivityMode = InteractivityMode.STATIC
    description: str = ""

    # --- Props (fresh empty list per instance) ----------------------------
    props: list[PropDefinition] = Field(default_factory=list)

    # --- Astro integration ------------------------------------------------
    astro_import: str
    install_command: str
    requires_client_directive: bool = False
57
+
58
+
59
class CustomComponentDefinition(BaseModel):
    """Definition of a component held in the custom registry.

    ``props`` is required here (no default). ``install_command``,
    ``description``, ``source`` and ``confidence`` all carry defaults.
    """

    # --- Required fields --------------------------------------------------
    name: str
    dom_skeleton: str
    structural_class_tokens: list[str]
    compatible_component_types: list[ComponentType]
    props: list[PropDefinition]
    astro_import: str

    # --- Optional fields --------------------------------------------------
    install_command: str = ""
    interactivity: InteractivityMode = InteractivityMode.STATIC
    description: str = ""
    # Free-form origin tag; defaults to "manual".
    source: str = "manual"
    # Not range-validated by this model.
    confidence: float = 1.0
71
+
72
+
73
class RankedCandidate(BaseModel):
    """A candidate component together with its match scores.

    Every score is validated to lie in the closed interval [0.0, 1.0].
    """

    component_name: str
    registry_source: RegistrySource
    signature: ComponentSignature

    # Per-criterion scores.
    structural_score: float = Field(ge=0.0, le=1.0)
    type_score: float = Field(ge=0.0, le=1.0)
    class_token_score: float = Field(ge=0.0, le=1.0)
    # NOTE(review): presumably a combination of the three scores above; the
    # formula is not defined in this model -- confirm at the call site.
    composite_score: float = Field(ge=0.0, le=1.0)
82
+
83
+
84
class PropMapping(BaseModel):
    """Result of mapping content onto a component's props.

    All fields default to "nothing mapped": each list field gets its own
    fresh empty list per instance.
    """

    # Individual mapping entries; their dict schema is not fixed here.
    mappings: list[dict[str, Any]] = Field(default_factory=list)
    has_ambiguous: bool = False
    unmapped_props: list[str] = Field(default_factory=list)
88
+
89
+
90
class AstroImport(BaseModel):
    """One import entry of a generated Astro component."""

    # Imported name.
    identifier: str
    # Module the name is imported from.
    source: str
    # Whether this is a default import; defaults to False.
    is_default: bool = False
94
+
95
+
96
class AstroComponent(BaseModel):
    """A generated Astro component file and the metadata needed to use it."""

    component_name: str
    file_path: str

    # File content, stored both in parts and as one assembled string.
    frontmatter: str
    template: str
    imports: list[AstroImport]
    full_file_content: str

    # Commands collected into the pipeline's install manifest.
    install_commands: list[str]

    # None means no client directive is set.
    client_directive: str | None = None
    is_collection_item: bool = False
106
+
107
+
108
class ContentCollectionSchema(BaseModel):
    """Schema description for a content collection.

    All three fields are required strings.
    """

    collection_name: str
    # Schema source text, stored as a string.
    zod_schema: str
    # Sample entry, stored as a string.
    example_entry: str
112
+
113
+
114
class MappingCacheRecord(BaseModel):
    """One entry of the mapping cache, identified by a fingerprint hash."""

    fingerprint_hash: str

    # The mapping that was cached.
    component_name: str
    registry_source: RegistrySource
    prop_mapping: PropMapping
    mapping_stage: MappingStage
    # Not range-validated by this model.
    confidence: float

    # Starts at 1 for a newly created record.
    hit_count: int = 1
122
+
123
+
124
class MappedComponent(BaseModel):
    """A segment that was successfully mapped to a component."""

    # --- Source segment ---------------------------------------------------
    segment_id: str
    page_url: str
    component_type: ComponentType
    classification_stage: str

    # --- Mapping decision -------------------------------------------------
    component_name: str
    registry_source: RegistrySource
    mapping_stage: MappingStage
    mapping_confidence: float

    prop_mapping: PropMapping

    # --- Generated output -------------------------------------------------
    astro_component: AstroComponent
    content_collection_schema: ContentCollectionSchema | None = None

    # --- LLM provenance (None unless set by the producer) ------------------
    llm_model_used: str | None = None
    llm_reasoning: str | None = None
142
+
143
+
144
class PipelineRunResult(BaseModel):
    """Summary of one pipeline run: mapped output plus run statistics."""

    # --- Segment outcomes -------------------------------------------------
    total_segments: int
    mapped: list[MappedComponent]
    unresolved: list[ClassifiedSegment]

    # Count of segments per mapping stage.
    stage_breakdown: dict[MappingStage, int]

    # --- External call accounting -----------------------------------------
    llm_calls_made: int
    # Call count per LLM model name.
    llm_model_usage: dict[str, int]
    mcp_calls_made: int

    # --- Rates (fractions, not percentages) -------------------------------
    cache_hit_rate: float
    structural_match_rate: float

    # --- Install manifest -------------------------------------------------
    install_commands: list[str]
    unique_components_used: list[str]
@@ -0,0 +1,182 @@
1
+ import asyncio
2
+ import logging
3
+ from component_mapper.config import MapperSettings
4
+ from component_mapper.models import (
5
+ PipelineRunResult,
6
+ MappingStage,
7
+ )
8
+ from component_mapper.mcp.official_client import OfficialMCPClient
9
+ from component_mapper.mcp.registry_fetcher import RegistryFetcher
10
+ from component_mapper.registry.signature_index import SignatureIndex
11
+ from component_mapper.registry.custom_registry import CustomRegistry
12
+ from component_mapper.cache.mapping_cache import MappingCache
13
+ from component_mapper.stages.cache_lookup import CacheLookupStage
14
+ from component_mapper.stages.structural_match import StructuralMatchStage
15
+ from component_mapper.stages.llm_mapper import LLMMapperStage
16
+ from component_mapper.stages.astro_stage import AstroStage
17
+ from segment_classifier.models import ClassifiedSegment
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
class MapperPipeline:
    """Wire together and drive the component-mapping stages.

    Usage: construct, ``await initialize()``, ``await run(segments)``, then
    ``await shutdown()``.
    """

    def __init__(self, settings: MapperSettings):
        """Build all collaborators and stages from ``settings``.

        Nothing is loaded or connected here; that happens in ``initialize``.
        """
        self.settings = settings
        self.mcp_client = OfficialMCPClient(settings.mcp)
        self.fetcher = RegistryFetcher(settings.registry)
        self.custom_registry = CustomRegistry()
        self.mapping_cache = MappingCache(
            settings.mapping_cache.cache_path,
            settings.mapping_cache.auto_persist_every,
        )
        self.signature_index = SignatureIndex(settings, self.fetcher, self.mcp_client)

        self.cache_stage = CacheLookupStage(self.mapping_cache, self.signature_index)
        self.structural_stage = StructuralMatchStage(
            self.signature_index, self.mapping_cache
        )
        self.llm_stage = LLMMapperStage(
            settings, self.signature_index, self.custom_registry, self.mapping_cache
        )
        self.astro_stage = AstroStage(settings, self.signature_index)

    async def initialize(self) -> None:
        """One-time setup. Call before run()."""
        logger.info("Initializing MapperPipeline")

        # Connect MCP (non-fatal if unavailable)
        await self.mcp_client.connect()

        # Load caches
        await asyncio.gather(
            self.mapping_cache.load(),
            self.custom_registry.load(
                self.settings.signature_index.custom_registry_path
            ),
        )

        # Build signature index (uses cache if fresh, else rebuilds)
        await self.signature_index.build()

        # Merge custom components into index (priority: custom first)
        self.signature_index.merge_custom(self.custom_registry.get_all())

        logger.info(
            "Pipeline initialized: %d components in index, %d mapping cache records",
            len(self.signature_index.get_all_component_names()),
            self.mapping_cache.size,
        )

    async def run(self, segments: list[ClassifiedSegment]) -> PipelineRunResult:
        """Map all segments. Returns PipelineRunResult.

        An empty ``segments`` list short-circuits to an all-zero result
        without invoking any stage.
        """
        if not segments:
            return PipelineRunResult(
                total_segments=0,
                mapped=[],
                unresolved=[],
                stage_breakdown={s: 0 for s in MappingStage},
                llm_calls_made=0,
                llm_model_usage={},
                mcp_calls_made=self.mcp_client.calls_made,
                cache_hit_rate=0.0,
                structural_match_rate=0.0,
                install_commands=[],
                unique_components_used=[],
            )

        total = len(segments)
        logger.info("Starting pipeline run: %d segments", total)

        # Stage 1: Cache lookup
        hits, misses = await self.cache_stage.process(segments)

        # Stage 2: Structural match
        direct_matches, ambiguous, novel = await self.structural_stage.process(misses)

        # Stage 3: LLM mapping
        llm_mapped, unresolved = await self.llm_stage.process(ambiguous, novel)

        # Stage 4: Astro generation
        all_mapped = hits + direct_matches + llm_mapped
        all_mapped = await self.astro_stage.process(all_mapped)

        # Collect install manifest (sets de-duplicate; sorted on output)
        install_set: set[str] = set()
        component_names: set[str] = set()
        for comp in all_mapped:
            if comp.astro_component:
                # Skip empty command strings.
                install_set.update(
                    cmd for cmd in comp.astro_component.install_commands if cmd
                )
            component_names.add(comp.component_name)

        # Stage breakdown: every stage is present, even with a zero count.
        breakdown: dict[MappingStage, int] = {s: 0 for s in MappingStage}
        for comp in all_mapped:
            breakdown[comp.mapping_stage] = breakdown.get(comp.mapping_stage, 0) + 1
        breakdown[MappingStage.UNRESOLVED] = len(unresolved)

        # ``total`` is non-zero here because of the early return above.
        cache_hit_rate = len(hits) / total
        # Structural rate is relative to cache misses, not to all segments.
        miss_count = len(misses)
        structural_match_rate = len(direct_matches) / miss_count if miss_count else 0.0
        llm_calls = self.llm_stage.calls_made

        logger.info(
            "Pipeline complete: %d mapped, %d unresolved | "
            "cache=%.0f%% structural=%.0f%% llm=%d calls",
            len(all_mapped),
            len(unresolved),
            cache_hit_rate * 100,
            structural_match_rate * 100,
            llm_calls,
        )

        return PipelineRunResult(
            total_segments=total,
            mapped=all_mapped,
            unresolved=unresolved,
            stage_breakdown=breakdown,
            llm_calls_made=llm_calls,
            llm_model_usage=self.llm_stage.get_model_usage(),
            mcp_calls_made=self.mcp_client.calls_made,
            cache_hit_rate=cache_hit_rate,
            structural_match_rate=structural_match_rate,
            install_commands=sorted(install_set),
            unique_components_used=sorted(component_names),
        )

    def _shadcn_component_names(self) -> list[str]:
        """Return the indexed component names whose signature source is shadcn."""
        return [
            name
            for name in self.signature_index.get_all_component_names()
            if (signature := self.signature_index.get_signature(name)) is not None
            and signature.registry_source.value == "shadcn"
        ]

    async def shutdown(self) -> None:
        """Persist state and install components. Call after run().

        The MCP client is disconnected even when persisting fails; the
        original error still propagates to the caller. A failed component
        install is logged and does not raise.
        """
        logger.info("Shutting down pipeline")

        try:
            # Persist caches
            await asyncio.gather(
                self.mapping_cache.persist(),
                self.custom_registry.persist(
                    self.settings.signature_index.custom_registry_path
                ),
            )

            # Install unique shadcn components via MCP
            # NOTE(review): this installs every shadcn component in the
            # signature index, not only those used by run() -- confirm that
            # is intended.
            unique_shadcn = self._shadcn_component_names()

            if unique_shadcn and self.mcp_client._connected:
                try:
                    results = await self.mcp_client.install_components(unique_shadcn)
                    success_count = sum(1 for v in results.values() if v)
                    logger.info(
                        "Installed %d/%d shadcn components",
                        success_count,
                        len(unique_shadcn),
                    )
                except Exception as exc:
                    logger.warning("Component install failed: %s", exc)
        finally:
            # Always release the MCP connection, whatever happened above.
            await self.mcp_client.disconnect()

        logger.info("Pipeline shutdown complete")
File without changes