codealmanac 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192) hide show
  1. codealmanac/__init__.py +13 -0
  2. codealmanac/app.py +175 -0
  3. codealmanac/cli/__init__.py +1 -0
  4. codealmanac/cli/dispatch/__init__.py +0 -0
  5. codealmanac/cli/dispatch/admin.py +124 -0
  6. codealmanac/cli/dispatch/config.py +50 -0
  7. codealmanac/cli/dispatch/root.py +328 -0
  8. codealmanac/cli/main.py +28 -0
  9. codealmanac/cli/parser/__init__.py +0 -0
  10. codealmanac/cli/parser/admin.py +81 -0
  11. codealmanac/cli/parser/lifecycle.py +57 -0
  12. codealmanac/cli/parser/root.py +19 -0
  13. codealmanac/cli/parser/wiki.py +87 -0
  14. codealmanac/cli/render/__init__.py +0 -0
  15. codealmanac/cli/render/admin.py +191 -0
  16. codealmanac/cli/render/root.py +290 -0
  17. codealmanac/core/__init__.py +1 -0
  18. codealmanac/core/errors.py +45 -0
  19. codealmanac/core/models.py +14 -0
  20. codealmanac/core/paths.py +25 -0
  21. codealmanac/core/slug.py +7 -0
  22. codealmanac/core/text.py +5 -0
  23. codealmanac/database/__init__.py +15 -0
  24. codealmanac/database/sqlite.py +54 -0
  25. codealmanac/integrations/__init__.py +1 -0
  26. codealmanac/integrations/automation/__init__.py +3 -0
  27. codealmanac/integrations/automation/scheduler/__init__.py +5 -0
  28. codealmanac/integrations/automation/scheduler/launchd.py +163 -0
  29. codealmanac/integrations/command.py +56 -0
  30. codealmanac/integrations/harnesses/__init__.py +7 -0
  31. codealmanac/integrations/harnesses/claude/__init__.py +1 -0
  32. codealmanac/integrations/harnesses/claude/adapter.py +217 -0
  33. codealmanac/integrations/harnesses/codex/__init__.py +3 -0
  34. codealmanac/integrations/harnesses/codex/adapter.py +221 -0
  35. codealmanac/integrations/harnesses/git_status.py +49 -0
  36. codealmanac/integrations/sources/__init__.py +29 -0
  37. codealmanac/integrations/sources/filesystem/__init__.py +5 -0
  38. codealmanac/integrations/sources/filesystem/adapter.py +685 -0
  39. codealmanac/integrations/sources/filesystem/selection.py +209 -0
  40. codealmanac/integrations/sources/git/__init__.py +3 -0
  41. codealmanac/integrations/sources/git/adapter.py +132 -0
  42. codealmanac/integrations/sources/github/__init__.py +3 -0
  43. codealmanac/integrations/sources/github/adapter.py +413 -0
  44. codealmanac/integrations/sources/runtime.py +22 -0
  45. codealmanac/integrations/sources/transcripts/__init__.py +33 -0
  46. codealmanac/integrations/sources/transcripts/claude.py +61 -0
  47. codealmanac/integrations/sources/transcripts/codex.py +69 -0
  48. codealmanac/integrations/sources/transcripts/jsonl.py +84 -0
  49. codealmanac/integrations/sources/transcripts/runtime.py +387 -0
  50. codealmanac/integrations/sources/web/__init__.py +3 -0
  51. codealmanac/integrations/sources/web/adapter.py +303 -0
  52. codealmanac/integrations/updates/__init__.py +7 -0
  53. codealmanac/integrations/updates/package.py +85 -0
  54. codealmanac/integrations/workspaces/__init__.py +1 -0
  55. codealmanac/integrations/workspaces/git/__init__.py +3 -0
  56. codealmanac/integrations/workspaces/git/probe.py +128 -0
  57. codealmanac/manual/README.md +24 -0
  58. codealmanac/manual/__init__.py +19 -0
  59. codealmanac/manual/build.md +20 -0
  60. codealmanac/manual/evidence.md +23 -0
  61. codealmanac/manual/garden.md +20 -0
  62. codealmanac/manual/ingest.md +17 -0
  63. codealmanac/manual/library.py +84 -0
  64. codealmanac/manual/models.py +83 -0
  65. codealmanac/manual/pages.md +28 -0
  66. codealmanac/manual/requests.py +6 -0
  67. codealmanac/manual/sources.md +18 -0
  68. codealmanac/manual/style.md +19 -0
  69. codealmanac/prompts/__init__.py +5 -0
  70. codealmanac/prompts/base/notability.md +14 -0
  71. codealmanac/prompts/base/purpose.md +23 -0
  72. codealmanac/prompts/base/syntax.md +19 -0
  73. codealmanac/prompts/models.py +9 -0
  74. codealmanac/prompts/operations/garden.md +26 -0
  75. codealmanac/prompts/operations/ingest.md +18 -0
  76. codealmanac/prompts/renderer.py +24 -0
  77. codealmanac/prompts/requests.py +22 -0
  78. codealmanac/server/__init__.py +1 -0
  79. codealmanac/server/app.py +202 -0
  80. codealmanac/server/assets/__init__.py +1 -0
  81. codealmanac/server/assets/app.css +865 -0
  82. codealmanac/server/assets/app.js +3 -0
  83. codealmanac/server/assets/index.html +80 -0
  84. codealmanac/server/assets/viewer/api.js +30 -0
  85. codealmanac/server/assets/viewer/components.js +197 -0
  86. codealmanac/server/assets/viewer/main.js +126 -0
  87. codealmanac/server/assets/viewer/renderers.js +122 -0
  88. codealmanac/server/assets/viewer/routes.js +36 -0
  89. codealmanac/services/__init__.py +1 -0
  90. codealmanac/services/automation/__init__.py +3 -0
  91. codealmanac/services/automation/models.py +83 -0
  92. codealmanac/services/automation/ports.py +14 -0
  93. codealmanac/services/automation/requests.py +40 -0
  94. codealmanac/services/automation/service.py +294 -0
  95. codealmanac/services/config/__init__.py +17 -0
  96. codealmanac/services/config/models.py +61 -0
  97. codealmanac/services/config/requests.py +21 -0
  98. codealmanac/services/config/service.py +55 -0
  99. codealmanac/services/config/store.py +26 -0
  100. codealmanac/services/diagnostics/__init__.py +1 -0
  101. codealmanac/services/diagnostics/models.py +22 -0
  102. codealmanac/services/diagnostics/requests.py +8 -0
  103. codealmanac/services/diagnostics/service.py +283 -0
  104. codealmanac/services/harnesses/__init__.py +1 -0
  105. codealmanac/services/harnesses/models.py +104 -0
  106. codealmanac/services/harnesses/ports.py +18 -0
  107. codealmanac/services/harnesses/requests.py +19 -0
  108. codealmanac/services/harnesses/service.py +38 -0
  109. codealmanac/services/health/__init__.py +1 -0
  110. codealmanac/services/health/requests.py +8 -0
  111. codealmanac/services/health/service.py +20 -0
  112. codealmanac/services/index/__init__.py +1 -0
  113. codealmanac/services/index/models.py +135 -0
  114. codealmanac/services/index/requests.py +26 -0
  115. codealmanac/services/index/service.py +86 -0
  116. codealmanac/services/index/store.py +411 -0
  117. codealmanac/services/index/views.py +524 -0
  118. codealmanac/services/pages/__init__.py +1 -0
  119. codealmanac/services/pages/requests.py +17 -0
  120. codealmanac/services/pages/service.py +26 -0
  121. codealmanac/services/runs/__init__.py +1 -0
  122. codealmanac/services/runs/models.py +91 -0
  123. codealmanac/services/runs/requests.py +76 -0
  124. codealmanac/services/runs/service.py +86 -0
  125. codealmanac/services/runs/store.py +256 -0
  126. codealmanac/services/search/__init__.py +1 -0
  127. codealmanac/services/search/requests.py +23 -0
  128. codealmanac/services/search/service.py +31 -0
  129. codealmanac/services/sources/__init__.py +1 -0
  130. codealmanac/services/sources/models.py +126 -0
  131. codealmanac/services/sources/ports.py +30 -0
  132. codealmanac/services/sources/requests.py +76 -0
  133. codealmanac/services/sources/service.py +351 -0
  134. codealmanac/services/tagging/__init__.py +1 -0
  135. codealmanac/services/tagging/models.py +9 -0
  136. codealmanac/services/tagging/requests.py +35 -0
  137. codealmanac/services/tagging/service.py +43 -0
  138. codealmanac/services/topics/__init__.py +1 -0
  139. codealmanac/services/topics/models.py +36 -0
  140. codealmanac/services/topics/requests.py +115 -0
  141. codealmanac/services/topics/service.py +297 -0
  142. codealmanac/services/updates/__init__.py +4 -0
  143. codealmanac/services/updates/models.py +83 -0
  144. codealmanac/services/updates/ports.py +17 -0
  145. codealmanac/services/updates/requests.py +10 -0
  146. codealmanac/services/updates/service.py +113 -0
  147. codealmanac/services/viewer/__init__.py +1 -0
  148. codealmanac/services/viewer/models.py +80 -0
  149. codealmanac/services/viewer/renderer.py +89 -0
  150. codealmanac/services/viewer/requests.py +86 -0
  151. codealmanac/services/viewer/service.py +211 -0
  152. codealmanac/services/wiki/__init__.py +1 -0
  153. codealmanac/services/wiki/documents.py +83 -0
  154. codealmanac/services/wiki/frontmatter.py +94 -0
  155. codealmanac/services/wiki/frontmatter_rewrite.py +142 -0
  156. codealmanac/services/wiki/models.py +69 -0
  157. codealmanac/services/wiki/paths.py +42 -0
  158. codealmanac/services/wiki/service.py +57 -0
  159. codealmanac/services/wiki/templates.py +73 -0
  160. codealmanac/services/wiki/topics.py +266 -0
  161. codealmanac/services/wiki/wikilinks.py +58 -0
  162. codealmanac/services/workspaces/__init__.py +1 -0
  163. codealmanac/services/workspaces/models.py +124 -0
  164. codealmanac/services/workspaces/ports.py +9 -0
  165. codealmanac/services/workspaces/requests.py +82 -0
  166. codealmanac/services/workspaces/roots.py +74 -0
  167. codealmanac/services/workspaces/service.py +303 -0
  168. codealmanac/services/workspaces/store.py +127 -0
  169. codealmanac/workflows/__init__.py +1 -0
  170. codealmanac/workflows/build/__init__.py +1 -0
  171. codealmanac/workflows/build/models.py +8 -0
  172. codealmanac/workflows/build/service.py +45 -0
  173. codealmanac/workflows/garden/__init__.py +3 -0
  174. codealmanac/workflows/garden/models.py +30 -0
  175. codealmanac/workflows/garden/requests.py +22 -0
  176. codealmanac/workflows/garden/service.py +239 -0
  177. codealmanac/workflows/ingest/__init__.py +1 -0
  178. codealmanac/workflows/ingest/models.py +26 -0
  179. codealmanac/workflows/ingest/requests.py +39 -0
  180. codealmanac/workflows/ingest/service.py +302 -0
  181. codealmanac/workflows/lifecycle.py +197 -0
  182. codealmanac/workflows/sync/__init__.py +3 -0
  183. codealmanac/workflows/sync/models.py +157 -0
  184. codealmanac/workflows/sync/requests.py +63 -0
  185. codealmanac/workflows/sync/service.py +651 -0
  186. codealmanac/workflows/sync/store.py +51 -0
  187. codealmanac-0.1.0.dev0.dist-info/METADATA +248 -0
  188. codealmanac-0.1.0.dev0.dist-info/RECORD +192 -0
  189. codealmanac-0.1.0.dev0.dist-info/WHEEL +5 -0
  190. codealmanac-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  191. codealmanac-0.1.0.dev0.dist-info/licenses/LICENSE.md +201 -0
  192. codealmanac-0.1.0.dev0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,387 @@
1
+ import json
2
+ from collections.abc import Iterator
3
+ from enum import StrEnum
4
+ from pathlib import Path
5
+
6
+ import jsonlines
7
+ from pydantic import Field, JsonValue, ValidationError, field_validator
8
+
9
+ from codealmanac.core.models import CodeAlmanacModel
10
+ from codealmanac.core.paths import normalize_path
11
+ from codealmanac.core.text import required_text
12
+ from codealmanac.integrations.sources.runtime import source_runtime_section
13
+ from codealmanac.services.sources.models import (
14
+ SourceKind,
15
+ SourceRef,
16
+ SourceRuntime,
17
+ SourceRuntimeStatus,
18
+ )
19
+ from codealmanac.services.sources.requests import InspectSourceRuntimeRequest
20
+
21
# Default character budget for rendered transcript content; used as the
# default ``max_chars`` of TranscriptSourceRuntimeAdapter.
DEFAULT_MAX_CHARS = 60_000
22
+
23
+
24
+ class TranscriptRuntimeLineKind(StrEnum):
25
+ META = "meta"
26
+ MESSAGE = "message"
27
+ TOOL_CALL = "tool_call"
28
+ TOOL_RESULT = "tool_result"
29
+ EVENT = "event"
30
+ RAW = "raw"
31
+
32
+
33
class TranscriptRuntimeEntry(CodeAlmanacModel):
    """One rendered transcript line: source line number, kind, label, text."""

    line_number: int
    kind: TranscriptRuntimeLineKind
    label: str
    text: str

    @field_validator("line_number")
    @classmethod
    def positive_line_number(cls, value: int) -> int:
        """Accept only line numbers of 1 or greater."""
        if value >= 1:
            return value
        raise ValueError("line number must be positive")

    @field_validator("label", "text")
    @classmethod
    def require_text(cls, value: str) -> str:
        """Run ``label`` and ``text`` through the shared ``required_text`` check."""
        return required_text(value, "transcript runtime entry")
50
+
51
+
52
class TranscriptJsonLine(CodeAlmanacModel):
    """Top-level fields read from one transcript JSONL object.

    Every field is optional. ``session_id`` is populated from the
    ``sessionId`` key.
    """

    # External provider JSON is intentionally kept as JsonValue at this first
    # boundary; helpers below validate known sub-shapes before reading fields.
    type: str | None = None
    timestamp: str | None = None
    session_id: str | None = Field(default=None, alias="sessionId")
    cwd: str | None = None
    payload: JsonValue | None = None
    message: JsonValue | None = None
61
+
62
+
63
class TranscriptPayload(CodeAlmanacModel):
    """Known sub-shape of a line's ``payload`` object; all fields optional."""

    id: str | None = None
    cwd: str | None = None
    thread_source: str | None = None
    message: str | None = None
    # Left as raw JSON here; validated separately into a TranscriptItem.
    item: JsonValue | None = None
69
+
70
+
71
class TranscriptMessage(CodeAlmanacModel):
    """Known sub-shape of a line's ``message`` object: role plus raw content."""

    role: str | None = None
    content: JsonValue | None = None
74
+
75
+
76
class TranscriptItem(CodeAlmanacModel):
    """Known sub-shape of a payload ``item``; all fields optional.

    ``content``, ``arguments`` and ``output`` stay as raw JSON and are
    turned into text only when an entry is rendered.
    """

    type: str | None = None
    role: str | None = None
    name: str | None = None
    call_id: str | None = None
    content: JsonValue | None = None
    arguments: JsonValue | None = None
    output: JsonValue | None = None
84
+
85
+
86
class TranscriptSourceRuntimeAdapter:
    """Source runtime adapter that renders a JSONL transcript file as text."""

    def __init__(self, max_chars: int = DEFAULT_MAX_CHARS):
        # Character budget handed to bounded_tail_text for the final content.
        self.max_chars = max_chars

    def supports(self, ref: SourceRef) -> bool:
        """Return True when ``ref`` is a transcript source."""
        return ref.kind == SourceKind.TRANSCRIPT

    def inspect(self, request: InspectSourceRuntimeRequest) -> SourceRuntime:
        """Read the referenced transcript and return its runtime view.

        Returns a SKIPPED runtime for non-transcript refs, an UNAVAILABLE
        runtime when the path is missing, is not a file, or yields no
        readable JSONL objects, and otherwise an AVAILABLE runtime whose
        content is the tail of the rendered transcript.
        """
        ref = request.ref
        if ref.kind != SourceKind.TRANSCRIPT:
            return SourceRuntime(
                ref=ref,
                status=SourceRuntimeStatus.SKIPPED,
                title=f"Unsupported transcript source {ref.identity}",
            )

        path = transcript_path(request.cwd, ref)
        if path is None:
            return unavailable_runtime(
                ref,
                "Transcript unavailable",
                "transcript source requires a path",
            )
        if not path.is_file():
            return unavailable_runtime(
                ref,
                "Transcript unavailable",
                f"transcript file not found: {path}",
            )

        entries = tuple(read_transcript_entries(path))
        if not entries:
            return unavailable_runtime(
                ref,
                f"Transcript {path}",
                "no readable JSONL objects found",
            )

        body = "\n".join(render_entry(entry) for entry in entries)
        metadata_section = source_runtime_section(
            "metadata",
            f"path: {path}\nreadable_entries: {len(entries)}",
        )
        transcript_section = source_runtime_section("transcript", body)
        # Keep the tail so the most recent transcript lines survive truncation.
        content, truncated = bounded_tail_text(
            "\n\n".join((metadata_section, transcript_section)),
            self.max_chars,
        )
        return SourceRuntime(
            ref=ref,
            status=SourceRuntimeStatus.AVAILABLE,
            title=f"Transcript {path}",
            content=content,
            truncated=truncated,
        )
140
+
141
+
142
def transcript_path(cwd: Path, ref: SourceRef) -> Path | None:
    """Resolve the transcript path named by ``ref``.

    Returns None when the ref carries no transcript value or only
    whitespace. ``~`` is expanded, relative paths are joined onto ``cwd``,
    and the result is passed through ``normalize_path``.
    """
    raw = ref.transcript
    if raw is None or not raw.strip():
        return None
    candidate = Path(raw).expanduser()
    resolved = candidate if candidate.is_absolute() else cwd / candidate
    return normalize_path(resolved)
149
+
150
+
151
def unavailable_runtime(ref: SourceRef, title: str, diagnostic: str) -> SourceRuntime:
    """Build an UNAVAILABLE runtime for ``ref`` carrying one diagnostic."""
    diagnostics = (diagnostic,)
    return SourceRuntime(
        ref=ref,
        status=SourceRuntimeStatus.UNAVAILABLE,
        title=title,
        diagnostics=diagnostics,
    )
158
+
159
+
160
def read_transcript_entries(path: Path) -> Iterator[TranscriptRuntimeEntry]:
    """Yield one runtime entry per readable JSON object line in ``path``.

    Line numbers are 1-based and count every physical line, including the
    ones that are skipped because they do not parse to a JSON object.

    The file is decoded as UTF-8 with ``errors="replace"`` so that a
    transcript containing invalid bytes does not abort the whole read with
    ``UnicodeDecodeError``; affected characters become U+FFFD.
    """
    with path.open("r", encoding="utf-8", errors="replace") as file:
        for line_number, line in enumerate(file, start=1):
            parsed = read_jsonl_object(line)
            if parsed is None:
                continue
            yield transcript_entry(line_number, parsed)
167
+
168
+
169
def read_jsonl_object(line: str) -> dict[str, object] | None:
    """Parse one JSONL line into a dict, or return None.

    Empty lines and lines that are invalid or not a JSON object are
    skipped by the reader, in which case nothing is produced.
    """
    reader = jsonlines.Reader([line])
    for parsed in reader.iter(type=dict, skip_empty=True, skip_invalid=True):
        return parsed
    return None
172
+
173
+
174
def transcript_entry(
    line_number: int,
    parsed: dict[str, object],
) -> TranscriptRuntimeEntry:
    """Classify one parsed JSONL object and turn it into a runtime entry.

    Precedence: a recognised payload, then a message object, then
    session/cwd metadata. Anything else, including objects that fail
    validation, is rendered as compact raw JSON.
    """
    try:
        line = TranscriptJsonLine.model_validate(parsed)
    except ValidationError:
        return runtime_entry(
            line_number,
            TranscriptRuntimeLineKind.RAW,
            "raw",
            compact_json(parsed),
        )

    payload = parse_payload(line.payload)
    if payload is not None:
        from_payload = entry_from_payload(line_number, line, payload)
        if from_payload is not None:
            return from_payload

    message = parse_message(line.message)
    if message is not None:
        speaker = message.role or line.type or "message"
        return runtime_entry(
            line_number,
            TranscriptRuntimeLineKind.MESSAGE,
            label_with_timestamp(speaker, line.timestamp),
            line_message_text(line, message),
        )

    if line.session_id is None and line.cwd is None:
        # Nothing recognisable: fall back to the raw JSON.
        return runtime_entry(
            line_number,
            TranscriptRuntimeLineKind.RAW,
            label_with_timestamp(line.type or "raw", line.timestamp),
            compact_json(parsed),
        )

    details: list[str] = []
    if line.session_id:
        details.append(f"session_id: {line.session_id}")
    if line.cwd:
        details.append(f"cwd: {line.cwd}")
    return runtime_entry(
        line_number,
        TranscriptRuntimeLineKind.META,
        label_with_timestamp(line.type or "meta", line.timestamp),
        "\n".join(details),
    )
222
+
223
+
224
def entry_from_payload(
    line_number: int,
    line: TranscriptJsonLine,
    payload: TranscriptPayload,
) -> TranscriptRuntimeEntry | None:
    """Build an entry from a validated payload, or None if nothing matches.

    A payload with an ``id`` or ``cwd`` becomes a META entry, one with a
    ``message`` string becomes an EVENT entry, and otherwise the payload's
    ``item`` (when it validates) is delegated to ``entry_from_item``.
    """
    if payload.id is not None or payload.cwd is not None:
        details: list[str] = []
        if payload.id:
            details.append(f"id: {payload.id}")
        if payload.cwd:
            details.append(f"cwd: {payload.cwd}")
        if payload.thread_source:
            details.append(f"thread_source: {payload.thread_source}")
        return runtime_entry(
            line_number,
            TranscriptRuntimeLineKind.META,
            label_with_timestamp(line.type or "payload", line.timestamp),
            "\n".join(details),
        )

    if payload.message is not None:
        return runtime_entry(
            line_number,
            TranscriptRuntimeLineKind.EVENT,
            label_with_timestamp(line.type or "event", line.timestamp),
            payload.message,
        )

    item = parse_item(payload.item)
    return None if item is None else entry_from_item(line_number, line, item)
258
+
259
+
260
def entry_from_item(
    line_number: int,
    line: TranscriptJsonLine,
    item: TranscriptItem,
) -> TranscriptRuntimeEntry:
    """Render a payload item as a tool call, tool result, or message entry.

    Tool calls are checked first (by type or by the presence of a name),
    then tool results (by type or by the presence of an output); any other
    item is treated as a message labelled by its role or type.
    """
    is_tool_call = item.name is not None or item.type in {"function_call", "tool_call"}
    if is_tool_call:
        call_label = f"tool_call {item.name or 'unknown'}"
        return runtime_entry(
            line_number,
            TranscriptRuntimeLineKind.TOOL_CALL,
            label_with_timestamp(call_label, line.timestamp),
            render_json_text(item.arguments),
        )

    is_tool_result = (
        item.output is not None
        or item.type in {"function_call_output", "tool_result"}
    )
    if is_tool_result:
        # A falsy output falls back to the item's content.
        result_value = item.output or item.content
        return runtime_entry(
            line_number,
            TranscriptRuntimeLineKind.TOOL_RESULT,
            label_with_timestamp("tool_result", line.timestamp),
            render_json_text(result_value),
        )

    speaker = item.role or item.type or "item"
    return runtime_entry(
        line_number,
        TranscriptRuntimeLineKind.MESSAGE,
        label_with_timestamp(speaker, line.timestamp),
        render_json_text(item.content),
    )
286
+
287
+
288
def parse_payload(value: JsonValue | None) -> TranscriptPayload | None:
    """Validate ``value`` as a TranscriptPayload; None if it is not a valid dict."""
    if isinstance(value, dict):
        try:
            return TranscriptPayload.model_validate(value)
        except ValidationError:
            pass
    return None
295
+
296
+
297
def parse_message(value: JsonValue | None) -> TranscriptMessage | None:
    """Validate ``value`` as a TranscriptMessage; None if it is not a valid dict."""
    if isinstance(value, dict):
        try:
            return TranscriptMessage.model_validate(value)
        except ValidationError:
            pass
    return None
304
+
305
+
306
def parse_item(value: JsonValue | None) -> TranscriptItem | None:
    """Validate ``value`` as a TranscriptItem; None if it is not a valid dict."""
    if isinstance(value, dict):
        try:
            return TranscriptItem.model_validate(value)
        except ValidationError:
            pass
    return None
313
+
314
+
315
def line_message_text(line: TranscriptJsonLine, message: TranscriptMessage) -> str:
    """Join the line's session id, cwd and rendered message content.

    Each piece is emitted on its own line and omitted when empty.
    """
    pieces: list[str] = []
    if line.session_id:
        pieces.append(f"session_id: {line.session_id}")
    if line.cwd:
        pieces.append(f"cwd: {line.cwd}")
    content = render_json_text(message.content)
    if content:
        pieces.append(content)
    return "\n".join(pieces)
326
+
327
+
328
def runtime_entry(
    line_number: int,
    kind: TranscriptRuntimeLineKind,
    label: str,
    text: str,
) -> TranscriptRuntimeEntry:
    """Build a TranscriptRuntimeEntry from stripped ``text``.

    Text that is empty after stripping is replaced by ``"(empty)"``.
    """
    rendered = text.strip() or "(empty)"
    return TranscriptRuntimeEntry(
        line_number=line_number,
        kind=kind,
        label=label,
        text=rendered,
    )
343
+
344
+
345
+ def label_with_timestamp(label: str, timestamp: str | None) -> str:
346
+ if timestamp is None or timestamp.strip() == "":
347
+ return label
348
+ return f"{label} {timestamp}"
349
+
350
+
351
def render_entry(entry: TranscriptRuntimeEntry) -> str:
    """Format an entry as ``L<line> [<kind>] <label>: <text>``.

    The line number is zero-padded to at least four digits.
    """
    line_tag = f"L{entry.line_number:04d}"
    kind_tag = f"[{entry.kind.value}]"
    return f"{line_tag} {kind_tag} {entry.label}: {entry.text}"
353
+
354
+
355
+ def render_json_text(value: JsonValue | None) -> str:
356
+ if value is None:
357
+ return ""
358
+ if isinstance(value, str):
359
+ return value
360
+ if isinstance(value, list):
361
+ return "\n".join(
362
+ part for part in (render_json_text(item) for item in value) if part
363
+ )
364
+ if isinstance(value, dict):
365
+ text = value.get("text")
366
+ if isinstance(text, str):
367
+ return text
368
+ content = value.get("content")
369
+ if content is not None:
370
+ return render_json_text(content)
371
+ return compact_json(value)
372
+ return compact_json(value)
373
+
374
+
375
def compact_json(value: JsonValue) -> str:
    """Serialize ``value`` as JSON with sorted keys and no extra whitespace."""
    tight_separators = (",", ":")
    return json.dumps(value, separators=tight_separators, sort_keys=True)
377
+
378
+
379
def bounded_tail_text(value: str, max_chars: int) -> tuple[str, bool]:
    """Return the tail of ``value`` limited to ``max_chars`` characters.

    Args:
        value: Full text to bound.
        max_chars: Maximum length of the returned text.

    Returns:
        ``(text, truncated)``. When ``value`` already fits it is returned
        unchanged with ``truncated`` False. Otherwise the text is a
        truncation marker followed by the most recent complete lines, and
        its total length (marker included) never exceeds ``max_chars``.
    """
    if len(value) <= max_chars:
        return value, False
    if max_chars <= 0:
        # ``value[-0:]`` would return the whole string, and a negative limit
        # would slice from the wrong end; nothing fits in a non-positive budget.
        return "", True
    marker = "[truncated earlier transcript lines]\n\n"
    budget = max_chars - len(marker)
    if budget <= 0:
        # No room for the marker: return the raw tail so the bound still holds.
        return value[-max_chars:], True
    tail = value[-budget:]
    # Drop the leading partial line so the tail starts on a line boundary.
    first_newline = tail.find("\n")
    if first_newline != -1:
        tail = tail[first_newline + 1 :]
    return marker + tail, True
@@ -0,0 +1,3 @@
1
+ from codealmanac.integrations.sources.web.adapter import WebSourceRuntimeAdapter
2
+
3
+ __all__ = ["WebSourceRuntimeAdapter"]
@@ -0,0 +1,303 @@
1
+ from enum import StrEnum
2
+
3
+ import httpx
4
+ from bs4 import BeautifulSoup
5
+ from pydantic import field_validator
6
+
7
+ from codealmanac.core.models import CodeAlmanacModel
8
+ from codealmanac.core.text import required_text
9
+ from codealmanac.integrations.sources.runtime import (
10
+ bounded_text,
11
+ source_runtime_section,
12
+ )
13
+ from codealmanac.services.sources.models import (
14
+ SourceKind,
15
+ SourceRef,
16
+ SourceRuntime,
17
+ SourceRuntimeStatus,
18
+ )
19
+ from codealmanac.services.sources.requests import InspectSourceRuntimeRequest
20
+
21
# Defaults for WebSourceRuntimeAdapter: request timeout in seconds, maximum
# number of response bytes read, and maximum characters of rendered content.
WEB_RUNTIME_TIMEOUT_SECONDS = 20
DEFAULT_MAX_BYTES = 2_000_000
DEFAULT_MAX_CHARS = 60_000
24
+
25
+
26
+ class WebContentKind(StrEnum):
27
+ HTML = "html"
28
+ TEXT = "text"
29
+
30
+
31
class FetchedWebResponse(CodeAlmanacModel):
    """Raw result of one HTTP fetch, before any content parsing."""

    final_url: str
    status_code: int
    content_type: str
    body: bytes
    # True when the body was cut off at the byte limit while reading.
    response_truncated: bool = False

    @field_validator("final_url", "content_type")
    @classmethod
    def require_text_fields(cls, value: str) -> str:
        """Run the URL and content type through the shared ``required_text`` check."""
        return required_text(value, "web response")

    @field_validator("status_code")
    @classmethod
    def validate_status_code(cls, value: int) -> int:
        """Accept only status codes in the inclusive range 100-599."""
        if 100 <= value <= 599:
            return value
        raise ValueError("HTTP status code must be between 100 and 599")
49
+
50
+
51
class WebRuntimeDocument(CodeAlmanacModel):
    """Parsed web response: decoded body text plus response metadata."""

    final_url: str
    status_code: int
    content_type: str
    content_kind: WebContentKind
    body: str
    # Only set for HTML documents that have a non-empty <title>.
    title: str | None = None
    response_truncated: bool = False

    @field_validator("final_url", "content_type", "body")
    @classmethod
    def require_text_fields(cls, value: str) -> str:
        """Run URL, content type and body through the shared ``required_text`` check."""
        return required_text(value, "web runtime document")
64
+
65
+
66
class UnsupportedWebContentError(Exception):
    """Raised when a response's media type is neither HTML nor a text format."""
68
+
69
+
70
class WebSourceRuntimeAdapter:
    """Source runtime adapter that fetches a web URL and renders its text.

    When no ``client`` is supplied, each fetch opens a short-lived
    ``httpx.Client`` that follows redirects and uses ``timeout_seconds``.
    """

    def __init__(
        self,
        client: httpx.Client | None = None,
        max_bytes: int = DEFAULT_MAX_BYTES,
        max_chars: int = DEFAULT_MAX_CHARS,
        timeout_seconds: int = WEB_RUNTIME_TIMEOUT_SECONDS,
    ):
        self.client = client
        # Upper bound on response bytes read from the network.
        self.max_bytes = max_bytes
        # Upper bound on characters of rendered runtime content.
        self.max_chars = max_chars
        self.timeout_seconds = timeout_seconds

    def supports(self, ref: SourceRef) -> bool:
        """Return True when ``ref`` is a web URL source."""
        return ref.kind == SourceKind.WEB_URL

    def inspect(self, request: InspectSourceRuntimeRequest) -> SourceRuntime:
        """Fetch the referenced URL and return its runtime view.

        Returns a SKIPPED runtime for non-web refs, an UNAVAILABLE runtime
        when the URL is missing or fetching/parsing fails, and otherwise an
        AVAILABLE runtime holding metadata and the bounded page text.
        """
        if request.ref.kind != SourceKind.WEB_URL:
            return SourceRuntime(
                ref=request.ref,
                status=SourceRuntimeStatus.SKIPPED,
                title=f"Unsupported web source {request.ref.identity}",
            )
        if request.ref.url is None:
            return unavailable_runtime(
                request.ref,
                "Web URL unavailable",
                "web source requires a URL",
            )
        try:
            response = self._fetch(request.ref.url)
            document = parse_web_response(response)
        except (
            httpx.HTTPError,
            # InvalidURL is not part of the HTTPError hierarchy, so a
            # malformed URL must be caught explicitly to be reported as
            # unavailable instead of propagating.
            httpx.InvalidURL,
            UnsupportedWebContentError,
            ValueError,
        ) as error:
            return unavailable_runtime(
                request.ref,
                "Web URL unavailable",
                first_error_line(error),
            )

        content, truncated = bounded_text(
            "\n\n".join(
                (
                    source_runtime_section("metadata", render_metadata(document)),
                    source_runtime_section("content", document.body),
                )
            ),
            self.max_chars,
        )
        title_suffix = f": {document.title}" if document.title is not None else ""
        return SourceRuntime(
            ref=request.ref,
            status=SourceRuntimeStatus.AVAILABLE,
            title=f"Web URL {document.final_url}{title_suffix}",
            content=content,
            # Truncated if either the rendered text or the byte stream was cut.
            truncated=truncated or document.response_truncated,
        )

    def _fetch(self, url: str) -> FetchedWebResponse:
        """Fetch ``url`` with the injected client, or a temporary one."""
        if self.client is not None:
            return fetch_with_client(
                self.client,
                url,
                self.max_bytes,
                self.timeout_seconds,
            )
        with httpx.Client(
            follow_redirects=True,
            timeout=self.timeout_seconds,
        ) as client:
            return fetch_with_client(
                client,
                url,
                self.max_bytes,
                self.timeout_seconds,
            )
145
+
146
+
147
def fetch_with_client(
    client: httpx.Client,
    url: str,
    max_bytes: int,
    timeout_seconds: int,
) -> FetchedWebResponse:
    """Stream a GET of ``url`` and capture at most ``max_bytes`` of the body.

    Redirects are followed. ``raise_for_status`` is called before the body
    is read. A missing or blank ``content-type`` header is recorded as
    ``"(none)"``.
    """
    stream = client.stream(
        "GET",
        url,
        follow_redirects=True,
        timeout=timeout_seconds,
    )
    with stream as response:
        response.raise_for_status()
        body, truncated = read_bounded_response(response, max_bytes)
        header_value = response.headers.get("content-type", "")
        content_type = header_value.strip()
        return FetchedWebResponse(
            final_url=str(response.url),
            status_code=response.status_code,
            content_type=content_type or "(none)",
            body=body,
            response_truncated=truncated,
        )
169
+
170
+
171
def read_bounded_response(
    response: httpx.Response,
    max_bytes: int,
) -> tuple[bytes, bool]:
    """Read up to ``max_bytes`` from ``response.iter_bytes()``.

    Returns the collected bytes and a flag that is True when reading
    stopped because the limit was reached while more data was arriving.
    """
    buffer = bytearray()
    truncated = False
    for chunk in response.iter_bytes():
        room = max_bytes - len(buffer)
        if room <= 0:
            # Limit already reached and another chunk arrived.
            truncated = True
            break
        if len(chunk) > room:
            # Keep only the part of this chunk that still fits.
            buffer += chunk[:room]
            truncated = True
            break
        buffer += chunk
    return bytes(buffer), truncated
190
+
191
+
192
def parse_web_response(response: FetchedWebResponse) -> WebRuntimeDocument:
    """Decode a fetched response and build the runtime document for it.

    HTML responses are delegated to ``parse_html_document``; every other
    supported content kind is whitespace-normalized as plain text.

    Raises:
        UnsupportedWebContentError: propagated from ``content_kind_for``
            when the media type is not supported.
    """
    content_kind = content_kind_for(response.content_type)
    text = decode_body(response.body)
    if content_kind == WebContentKind.HTML:
        return parse_html_document(response, text)
    body = normalized_text(text)
    if body == "":
        # Mirror the HTML branch's placeholder: a blank body would presumably
        # be rejected by WebRuntimeDocument's required-text validator and
        # surface as a validation-error diagnostic - TODO confirm.
        body = "(empty response text)"
    return WebRuntimeDocument(
        final_url=response.final_url,
        status_code=response.status_code,
        content_type=response.content_type,
        content_kind=content_kind,
        body=body,
        response_truncated=response.response_truncated,
    )
205
+
206
+
207
def content_kind_for(content_type: str) -> WebContentKind:
    """Map a ``Content-Type`` header value to a WebContentKind.

    Parameters after ``;`` are ignored and matching is case-insensitive.
    The ``"(none)"`` placeholder is treated as HTML.

    Raises:
        UnsupportedWebContentError: for any media type that is neither
            HTML, ``text/*``, nor one of the listed JSON/XML/feed types.
    """
    media_type = content_type.partition(";")[0].strip().casefold()
    html_types = {"text/html", "application/xhtml+xml", "(none)"}
    if media_type in html_types:
        return WebContentKind.HTML
    textual_application_types = {
        "application/json",
        "application/ld+json",
        "application/xml",
        "application/rss+xml",
        "application/atom+xml",
    }
    if media_type.startswith("text/") or media_type in textual_application_types:
        return WebContentKind.TEXT
    raise UnsupportedWebContentError(f"unsupported web content type: {media_type}")
222
+
223
+
224
def parse_html_document(response: FetchedWebResponse, html: str) -> WebRuntimeDocument:
    """Extract the title and visible text from an HTML response.

    Script, style, noscript, template and svg elements are removed before
    text extraction. A page with no remaining text gets the placeholder
    body ``"(empty page text)"``.
    """
    soup = BeautifulSoup(html, "html.parser")
    non_content_tags = ("script", "style", "noscript", "template", "svg")
    for element in soup.find_all(non_content_tags):
        element.decompose()
    title = title_text(soup)
    body = normalized_text(soup.get_text("\n")) or "(empty page text)"
    return WebRuntimeDocument(
        final_url=response.final_url,
        status_code=response.status_code,
        content_type=response.content_type,
        content_kind=WebContentKind.HTML,
        title=title,
        body=body,
        response_truncated=response.response_truncated,
    )
241
+
242
+
243
def decode_body(body: bytes) -> str:
    """Decode ``body`` as UTF-8, replacing undecodable bytes with U+FFFD."""
    return str(body, "utf-8", "replace")
245
+
246
+
247
def title_text(soup: BeautifulSoup) -> str | None:
    """Return the normalized ``<title>`` text, or None if absent or blank."""
    title_tag = soup.title
    if title_tag is None:
        return None
    return normalized_text(title_tag.get_text(" ")) or None
254
+
255
+
256
def normalized_text(value: str) -> str:
    """Collapse whitespace in ``value`` line by line.

    Runs of whitespace inside a line become a single space, consecutive
    blank lines become one, and leading and trailing blank lines are
    dropped.
    """
    collapsed: list[str] = []
    for raw_line in value.splitlines():
        line = " ".join(raw_line.split())
        if line:
            collapsed.append(line)
        elif collapsed and collapsed[-1] != "":
            # First blank line after content; further blanks are skipped.
            collapsed.append("")
    while collapsed and collapsed[-1] == "":
        collapsed.pop()
    return "\n".join(collapsed)
271
+
272
+
273
def render_metadata(document: WebRuntimeDocument) -> str:
    """Render the document's metadata as ``key: value`` lines.

    The ``title`` line is included only when the document has a title.
    """
    truncated_flag = str(document.response_truncated).lower()
    fields = [
        ("final_url", document.final_url),
        ("status_code", document.status_code),
        ("content_type", document.content_type),
        ("content_kind", document.content_kind.value),
        ("response_truncated", truncated_flag),
    ]
    if document.title is not None:
        fields.append(("title", document.title))
    return "\n".join(f"{key}: {value}" for key, value in fields)
284
+
285
+
286
def unavailable_runtime(
    ref: SourceRef,
    title: str,
    diagnostic: str,
) -> SourceRuntime:
    """Build an UNAVAILABLE runtime for ``ref`` carrying one diagnostic."""
    diagnostics = (diagnostic,)
    return SourceRuntime(
        ref=ref,
        status=SourceRuntimeStatus.UNAVAILABLE,
        title=title,
        diagnostics=diagnostics,
    )
297
+
298
+
299
def first_error_line(error: Exception) -> str:
    """Return the first non-blank line of ``str(error)``, stripped.

    Falls back to the exception's class name when the message has no
    non-blank line.
    """
    for line in str(error).splitlines():
        stripped = line.strip()
        if stripped:
            return stripped
    return error.__class__.__name__
@@ -0,0 +1,7 @@
1
+ from codealmanac.integrations.updates.package import (
2
+ InstalledPackageMetadataProvider,
3
+ SubprocessPackageCommandRunner,
4
+ )
5
+
6
+ __all__ = ["InstalledPackageMetadataProvider", "SubprocessPackageCommandRunner"]
7
+