svc-infra 0.1.595__py3-none-any.whl → 0.1.706__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of svc-infra might be problematic. Click here for more details.

Files changed (256)
  1. svc_infra/__init__.py +58 -2
  2. svc_infra/apf_payments/models.py +133 -42
  3. svc_infra/apf_payments/provider/aiydan.py +121 -47
  4. svc_infra/apf_payments/provider/base.py +30 -9
  5. svc_infra/apf_payments/provider/stripe.py +156 -62
  6. svc_infra/apf_payments/schemas.py +18 -9
  7. svc_infra/apf_payments/service.py +98 -41
  8. svc_infra/apf_payments/settings.py +5 -1
  9. svc_infra/api/__init__.py +61 -0
  10. svc_infra/api/fastapi/__init__.py +15 -0
  11. svc_infra/api/fastapi/admin/__init__.py +3 -0
  12. svc_infra/api/fastapi/admin/add.py +245 -0
  13. svc_infra/api/fastapi/apf_payments/router.py +128 -70
  14. svc_infra/api/fastapi/apf_payments/setup.py +13 -6
  15. svc_infra/api/fastapi/auth/__init__.py +65 -0
  16. svc_infra/api/fastapi/auth/_cookies.py +6 -2
  17. svc_infra/api/fastapi/auth/add.py +17 -14
  18. svc_infra/api/fastapi/auth/gaurd.py +45 -16
  19. svc_infra/api/fastapi/auth/mfa/models.py +3 -1
  20. svc_infra/api/fastapi/auth/mfa/pre_auth.py +10 -6
  21. svc_infra/api/fastapi/auth/mfa/router.py +15 -8
  22. svc_infra/api/fastapi/auth/mfa/security.py +1 -2
  23. svc_infra/api/fastapi/auth/mfa/utils.py +2 -1
  24. svc_infra/api/fastapi/auth/mfa/verify.py +9 -2
  25. svc_infra/api/fastapi/auth/policy.py +0 -1
  26. svc_infra/api/fastapi/auth/providers.py +3 -1
  27. svc_infra/api/fastapi/auth/routers/apikey_router.py +6 -6
  28. svc_infra/api/fastapi/auth/routers/oauth_router.py +146 -52
  29. svc_infra/api/fastapi/auth/routers/session_router.py +6 -2
  30. svc_infra/api/fastapi/auth/security.py +31 -10
  31. svc_infra/api/fastapi/auth/sender.py +8 -1
  32. svc_infra/api/fastapi/auth/state.py +3 -1
  33. svc_infra/api/fastapi/auth/ws_security.py +275 -0
  34. svc_infra/api/fastapi/billing/router.py +73 -0
  35. svc_infra/api/fastapi/billing/setup.py +19 -0
  36. svc_infra/api/fastapi/cache/add.py +9 -5
  37. svc_infra/api/fastapi/db/__init__.py +5 -1
  38. svc_infra/api/fastapi/db/http.py +3 -1
  39. svc_infra/api/fastapi/db/nosql/__init__.py +39 -1
  40. svc_infra/api/fastapi/db/nosql/mongo/add.py +47 -32
  41. svc_infra/api/fastapi/db/nosql/mongo/crud_router.py +30 -11
  42. svc_infra/api/fastapi/db/sql/__init__.py +5 -1
  43. svc_infra/api/fastapi/db/sql/add.py +71 -26
  44. svc_infra/api/fastapi/db/sql/crud_router.py +210 -22
  45. svc_infra/api/fastapi/db/sql/health.py +3 -1
  46. svc_infra/api/fastapi/db/sql/session.py +18 -0
  47. svc_infra/api/fastapi/db/sql/users.py +18 -6
  48. svc_infra/api/fastapi/dependencies/ratelimit.py +78 -14
  49. svc_infra/api/fastapi/docs/add.py +173 -0
  50. svc_infra/api/fastapi/docs/landing.py +4 -2
  51. svc_infra/api/fastapi/docs/scoped.py +62 -15
  52. svc_infra/api/fastapi/dual/__init__.py +12 -2
  53. svc_infra/api/fastapi/dual/dualize.py +1 -1
  54. svc_infra/api/fastapi/dual/protected.py +126 -4
  55. svc_infra/api/fastapi/dual/public.py +25 -0
  56. svc_infra/api/fastapi/dual/router.py +40 -13
  57. svc_infra/api/fastapi/dx.py +33 -2
  58. svc_infra/api/fastapi/ease.py +10 -2
  59. svc_infra/api/fastapi/http/concurrency.py +2 -1
  60. svc_infra/api/fastapi/http/conditional.py +3 -1
  61. svc_infra/api/fastapi/middleware/debug.py +4 -1
  62. svc_infra/api/fastapi/middleware/errors/catchall.py +6 -2
  63. svc_infra/api/fastapi/middleware/errors/exceptions.py +1 -1
  64. svc_infra/api/fastapi/middleware/errors/handlers.py +54 -8
  65. svc_infra/api/fastapi/middleware/graceful_shutdown.py +104 -0
  66. svc_infra/api/fastapi/middleware/idempotency.py +197 -70
  67. svc_infra/api/fastapi/middleware/idempotency_store.py +187 -0
  68. svc_infra/api/fastapi/middleware/optimistic_lock.py +42 -0
  69. svc_infra/api/fastapi/middleware/ratelimit.py +125 -28
  70. svc_infra/api/fastapi/middleware/ratelimit_store.py +43 -10
  71. svc_infra/api/fastapi/middleware/request_id.py +27 -11
  72. svc_infra/api/fastapi/middleware/request_size_limit.py +3 -3
  73. svc_infra/api/fastapi/middleware/timeout.py +177 -0
  74. svc_infra/api/fastapi/openapi/apply.py +5 -3
  75. svc_infra/api/fastapi/openapi/conventions.py +9 -2
  76. svc_infra/api/fastapi/openapi/mutators.py +165 -20
  77. svc_infra/api/fastapi/openapi/pipeline.py +1 -1
  78. svc_infra/api/fastapi/openapi/security.py +3 -1
  79. svc_infra/api/fastapi/ops/add.py +75 -0
  80. svc_infra/api/fastapi/pagination.py +47 -20
  81. svc_infra/api/fastapi/routers/__init__.py +43 -15
  82. svc_infra/api/fastapi/routers/ping.py +1 -0
  83. svc_infra/api/fastapi/setup.py +188 -57
  84. svc_infra/api/fastapi/tenancy/add.py +19 -0
  85. svc_infra/api/fastapi/tenancy/context.py +112 -0
  86. svc_infra/api/fastapi/versioned.py +101 -0
  87. svc_infra/app/README.md +5 -5
  88. svc_infra/app/__init__.py +3 -1
  89. svc_infra/app/env.py +69 -1
  90. svc_infra/app/logging/add.py +9 -2
  91. svc_infra/app/logging/formats.py +12 -5
  92. svc_infra/billing/__init__.py +23 -0
  93. svc_infra/billing/async_service.py +147 -0
  94. svc_infra/billing/jobs.py +241 -0
  95. svc_infra/billing/models.py +177 -0
  96. svc_infra/billing/quotas.py +103 -0
  97. svc_infra/billing/schemas.py +36 -0
  98. svc_infra/billing/service.py +123 -0
  99. svc_infra/bundled_docs/README.md +5 -0
  100. svc_infra/bundled_docs/__init__.py +1 -0
  101. svc_infra/bundled_docs/getting-started.md +6 -0
  102. svc_infra/cache/__init__.py +9 -0
  103. svc_infra/cache/add.py +170 -0
  104. svc_infra/cache/backend.py +7 -6
  105. svc_infra/cache/decorators.py +81 -15
  106. svc_infra/cache/demo.py +2 -2
  107. svc_infra/cache/keys.py +24 -4
  108. svc_infra/cache/recache.py +26 -14
  109. svc_infra/cache/resources.py +14 -5
  110. svc_infra/cache/tags.py +19 -44
  111. svc_infra/cache/utils.py +3 -1
  112. svc_infra/cli/__init__.py +52 -8
  113. svc_infra/cli/__main__.py +4 -0
  114. svc_infra/cli/cmds/__init__.py +39 -2
  115. svc_infra/cli/cmds/db/nosql/mongo/mongo_cmds.py +7 -4
  116. svc_infra/cli/cmds/db/nosql/mongo/mongo_scaffold_cmds.py +7 -5
  117. svc_infra/cli/cmds/db/ops_cmds.py +270 -0
  118. svc_infra/cli/cmds/db/sql/alembic_cmds.py +103 -18
  119. svc_infra/cli/cmds/db/sql/sql_export_cmds.py +88 -0
  120. svc_infra/cli/cmds/db/sql/sql_scaffold_cmds.py +3 -3
  121. svc_infra/cli/cmds/docs/docs_cmds.py +142 -0
  122. svc_infra/cli/cmds/dx/__init__.py +12 -0
  123. svc_infra/cli/cmds/dx/dx_cmds.py +116 -0
  124. svc_infra/cli/cmds/health/__init__.py +179 -0
  125. svc_infra/cli/cmds/health/health_cmds.py +8 -0
  126. svc_infra/cli/cmds/help.py +4 -0
  127. svc_infra/cli/cmds/jobs/__init__.py +1 -0
  128. svc_infra/cli/cmds/jobs/jobs_cmds.py +47 -0
  129. svc_infra/cli/cmds/obs/obs_cmds.py +36 -15
  130. svc_infra/cli/cmds/sdk/__init__.py +0 -0
  131. svc_infra/cli/cmds/sdk/sdk_cmds.py +112 -0
  132. svc_infra/cli/foundation/runner.py +6 -2
  133. svc_infra/data/add.py +61 -0
  134. svc_infra/data/backup.py +58 -0
  135. svc_infra/data/erasure.py +45 -0
  136. svc_infra/data/fixtures.py +42 -0
  137. svc_infra/data/retention.py +61 -0
  138. svc_infra/db/__init__.py +15 -0
  139. svc_infra/db/crud_schema.py +9 -9
  140. svc_infra/db/inbox.py +67 -0
  141. svc_infra/db/nosql/__init__.py +3 -0
  142. svc_infra/db/nosql/core.py +30 -9
  143. svc_infra/db/nosql/indexes.py +3 -1
  144. svc_infra/db/nosql/management.py +1 -1
  145. svc_infra/db/nosql/mongo/README.md +13 -13
  146. svc_infra/db/nosql/mongo/client.py +19 -2
  147. svc_infra/db/nosql/mongo/settings.py +6 -2
  148. svc_infra/db/nosql/repository.py +35 -15
  149. svc_infra/db/nosql/resource.py +20 -3
  150. svc_infra/db/nosql/scaffold.py +9 -3
  151. svc_infra/db/nosql/service.py +3 -1
  152. svc_infra/db/nosql/types.py +6 -2
  153. svc_infra/db/ops.py +384 -0
  154. svc_infra/db/outbox.py +108 -0
  155. svc_infra/db/sql/apikey.py +37 -9
  156. svc_infra/db/sql/authref.py +9 -3
  157. svc_infra/db/sql/constants.py +12 -8
  158. svc_infra/db/sql/core.py +2 -2
  159. svc_infra/db/sql/management.py +11 -8
  160. svc_infra/db/sql/repository.py +99 -26
  161. svc_infra/db/sql/resource.py +5 -0
  162. svc_infra/db/sql/scaffold.py +6 -2
  163. svc_infra/db/sql/service.py +15 -5
  164. svc_infra/db/sql/templates/models_schemas/auth/models.py.tmpl +7 -56
  165. svc_infra/db/sql/templates/setup/env_async.py.tmpl +34 -12
  166. svc_infra/db/sql/templates/setup/env_sync.py.tmpl +29 -7
  167. svc_infra/db/sql/tenant.py +88 -0
  168. svc_infra/db/sql/uniq_hooks.py +9 -3
  169. svc_infra/db/sql/utils.py +138 -51
  170. svc_infra/db/sql/versioning.py +14 -0
  171. svc_infra/deploy/__init__.py +538 -0
  172. svc_infra/documents/__init__.py +100 -0
  173. svc_infra/documents/add.py +264 -0
  174. svc_infra/documents/ease.py +233 -0
  175. svc_infra/documents/models.py +114 -0
  176. svc_infra/documents/storage.py +264 -0
  177. svc_infra/dx/add.py +65 -0
  178. svc_infra/dx/changelog.py +74 -0
  179. svc_infra/dx/checks.py +68 -0
  180. svc_infra/exceptions.py +141 -0
  181. svc_infra/health/__init__.py +864 -0
  182. svc_infra/http/__init__.py +13 -0
  183. svc_infra/http/client.py +105 -0
  184. svc_infra/jobs/builtins/outbox_processor.py +40 -0
  185. svc_infra/jobs/builtins/webhook_delivery.py +95 -0
  186. svc_infra/jobs/easy.py +33 -0
  187. svc_infra/jobs/loader.py +50 -0
  188. svc_infra/jobs/queue.py +116 -0
  189. svc_infra/jobs/redis_queue.py +256 -0
  190. svc_infra/jobs/runner.py +79 -0
  191. svc_infra/jobs/scheduler.py +53 -0
  192. svc_infra/jobs/worker.py +40 -0
  193. svc_infra/loaders/__init__.py +186 -0
  194. svc_infra/loaders/base.py +142 -0
  195. svc_infra/loaders/github.py +311 -0
  196. svc_infra/loaders/models.py +147 -0
  197. svc_infra/loaders/url.py +235 -0
  198. svc_infra/logging/__init__.py +374 -0
  199. svc_infra/mcp/svc_infra_mcp.py +91 -33
  200. svc_infra/obs/README.md +2 -0
  201. svc_infra/obs/add.py +65 -9
  202. svc_infra/obs/cloud_dash.py +2 -1
  203. svc_infra/obs/grafana/dashboards/http-overview.json +45 -0
  204. svc_infra/obs/metrics/__init__.py +3 -4
  205. svc_infra/obs/metrics/asgi.py +13 -7
  206. svc_infra/obs/metrics/http.py +9 -5
  207. svc_infra/obs/metrics/sqlalchemy.py +13 -9
  208. svc_infra/obs/metrics.py +6 -5
  209. svc_infra/obs/settings.py +6 -2
  210. svc_infra/security/add.py +217 -0
  211. svc_infra/security/audit.py +92 -10
  212. svc_infra/security/audit_service.py +4 -3
  213. svc_infra/security/headers.py +15 -2
  214. svc_infra/security/hibp.py +14 -4
  215. svc_infra/security/jwt_rotation.py +74 -22
  216. svc_infra/security/lockout.py +11 -5
  217. svc_infra/security/models.py +54 -12
  218. svc_infra/security/oauth_models.py +73 -0
  219. svc_infra/security/org_invites.py +5 -3
  220. svc_infra/security/passwords.py +3 -1
  221. svc_infra/security/permissions.py +25 -2
  222. svc_infra/security/session.py +1 -1
  223. svc_infra/security/signed_cookies.py +21 -1
  224. svc_infra/storage/__init__.py +93 -0
  225. svc_infra/storage/add.py +253 -0
  226. svc_infra/storage/backends/__init__.py +11 -0
  227. svc_infra/storage/backends/local.py +339 -0
  228. svc_infra/storage/backends/memory.py +216 -0
  229. svc_infra/storage/backends/s3.py +353 -0
  230. svc_infra/storage/base.py +239 -0
  231. svc_infra/storage/easy.py +185 -0
  232. svc_infra/storage/settings.py +195 -0
  233. svc_infra/testing/__init__.py +685 -0
  234. svc_infra/utils.py +7 -3
  235. svc_infra/webhooks/__init__.py +69 -0
  236. svc_infra/webhooks/add.py +339 -0
  237. svc_infra/webhooks/encryption.py +115 -0
  238. svc_infra/webhooks/fastapi.py +39 -0
  239. svc_infra/webhooks/router.py +55 -0
  240. svc_infra/webhooks/service.py +70 -0
  241. svc_infra/webhooks/signing.py +34 -0
  242. svc_infra/websocket/__init__.py +79 -0
  243. svc_infra/websocket/add.py +140 -0
  244. svc_infra/websocket/client.py +282 -0
  245. svc_infra/websocket/config.py +69 -0
  246. svc_infra/websocket/easy.py +76 -0
  247. svc_infra/websocket/exceptions.py +61 -0
  248. svc_infra/websocket/manager.py +344 -0
  249. svc_infra/websocket/models.py +49 -0
  250. svc_infra-0.1.706.dist-info/LICENSE +21 -0
  251. svc_infra-0.1.706.dist-info/METADATA +356 -0
  252. svc_infra-0.1.706.dist-info/RECORD +357 -0
  253. svc_infra-0.1.595.dist-info/METADATA +0 -80
  254. svc_infra-0.1.595.dist-info/RECORD +0 -253
  255. {svc_infra-0.1.595.dist-info → svc_infra-0.1.706.dist-info}/WHEEL +0 -0
  256. {svc_infra-0.1.595.dist-info → svc_infra-0.1.706.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,142 @@
1
+ """Base loader class for content loaders.
2
+
3
+ This module defines the abstract base class that all loaders must implement.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import asyncio
9
+ import logging
10
+ from abc import ABC, abstractmethod
11
+ from typing import TYPE_CHECKING, AsyncIterator, Literal
12
+
13
+ if TYPE_CHECKING:
14
+ from .models import LoadedContent
15
+
16
logger = logging.getLogger(__name__)


# Error handling strategies
ErrorStrategy = Literal["skip", "raise"]


class BaseLoader(ABC):
    """Abstract base class for content loaders.

    All loaders are async-first with sync wrappers provided for convenience.
    Subclasses must implement the `load()` method.

    Attributes:
        on_error: How to handle errors during loading.
            - "skip" (default): Log warning and skip failed items
            - "raise": Raise exception on first failure

    Example:
        >>> class MyLoader(BaseLoader):
        ...     async def load(self) -> list[LoadedContent]:
        ...         # Implement loading logic
        ...         return [LoadedContent(content="...", source="...")]
        >>>
        >>> # Async usage (preferred)
        >>> loader = MyLoader()
        >>> contents = await loader.load()
        >>>
        >>> # Sync usage (convenience)
        >>> contents = loader.load_sync()
    """

    def __init__(
        self,
        on_error: ErrorStrategy = "skip",
    ) -> None:
        """Initialize the base loader.

        Args:
            on_error: Error handling strategy ("skip" or "raise").
        """
        self.on_error = on_error

    @abstractmethod
    async def load(self) -> list["LoadedContent"]:
        """Load all content from the source.

        This is the main method that subclasses must implement.

        Returns:
            List of LoadedContent objects.

        Raises:
            Various exceptions depending on the loader and error strategy.
        """
        ...

    async def aiter(self) -> AsyncIterator["LoadedContent"]:
        """Iterate over loaded content asynchronously.

        The default implementation awaits `load()` and then yields the items
        one by one, so nothing is yielded until everything has been loaded.
        Subclasses can override this for true streaming.

        Yields:
            LoadedContent objects, in the order returned by `load()`.

        Example:
            >>> async for content in loader.aiter():
            ...     print(f"Loaded: {content.source}")
            ...     process(content)
        """
        for content in await self.load():
            yield content

    def load_sync(self) -> list["LoadedContent"]:
        """Synchronous wrapper for load().

        When no event loop is running, the coroutine is executed with
        `asyncio.run()`. When called from inside a running loop, the optional
        `nest_asyncio` package is used to re-enter that loop. Prefer the
        async version when running in an async context.

        Returns:
            List of LoadedContent objects.

        Raises:
            RuntimeError: If called from within a running event loop and
                `nest_asyncio` is not installed.

        Example:
            >>> # In a script or notebook
            >>> loader = GitHubLoader("nfraxlab/svc-infra", path="docs")
            >>> contents = loader.load_sync()
        """
        try:
            # Check if we're already in an async context
            loop = asyncio.get_running_loop()
        except RuntimeError:
            # No running loop - safe to use asyncio.run()
            return asyncio.run(self.load())

        # Already in async context - use nest_asyncio if available.
        # Only the import is guarded: an ImportError raised while running
        # load() itself must propagate unchanged instead of being reported
        # as a missing nest_asyncio installation.
        try:
            import nest_asyncio
        except ImportError as exc:
            raise RuntimeError(
                "Cannot call load_sync() from within an async context. "
                "Use 'await loader.load()' instead, or install nest_asyncio: "
                "pip install nest-asyncio"
            ) from exc

        nest_asyncio.apply()
        return loop.run_until_complete(self.load())

    def _handle_error(self, error: Exception, context: str) -> None:
        """Handle an error according to the error strategy.

        Args:
            error: The exception that occurred.
            context: Description of what was being done (for logging).

        Raises:
            The original exception if on_error="raise".
        """
        if self.on_error == "raise":
            raise error
        # Any other strategy value is treated as "skip": log and carry on.
        logger.warning("Skipping %s: %s", context, error)

    def __repr__(self) -> str:
        """Return string representation of the loader."""
        return f"{self.__class__.__name__}(on_error={self.on_error!r})"
@@ -0,0 +1,311 @@
1
+ """GitHub content loader.
2
+
3
+ Load files from GitHub repositories using the GitHub API.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import fnmatch
9
+ import logging
10
+ import os
11
+ from typing import Any
12
+
13
+ import httpx
14
+
15
+ from .base import BaseLoader, ErrorStrategy
16
+ from .models import LoadedContent
17
+
18
logger = logging.getLogger(__name__)


class GitHubLoader(BaseLoader):
    """Load files from a GitHub repository.

    Lists the repository tree through the GitHub API, filters the files by
    base path, include pattern and skip patterns, then downloads each match
    from raw.githubusercontent.com.
    Supports public repos and private repos (with token).

    Args:
        repo: Repository in "owner/repo" format (e.g., "nfraxlab/svc-infra")
        path: Path within repo to load from (e.g., "docs", "examples/src").
            Empty string means repo root.
        branch: Branch name (default: "main")
        pattern: Glob pattern for file names to include (default: "*.md").
            Use "*" to match all files. Several patterns can be combined
            with "|" (e.g., "*.md|*.rst").
        token: GitHub token for private repos or higher rate limits.
            Falls back to GITHUB_TOKEN environment variable.
        recursive: Whether to search subdirectories (default: True)
        skip_patterns: List of patterns to skip. When omitted, a copy of
            DEFAULT_SKIP_PATTERNS is used.
        extra_metadata: Additional metadata to attach to all loaded content.
        on_error: How to handle errors ("skip" or "raise"). Default: "skip"

    Example:
        >>> # Load all markdown from docs/
        >>> loader = GitHubLoader("nfraxlab/svc-infra", path="docs")
        >>> contents = await loader.load()
        >>> for c in contents:
        ...     print(f"Loaded: {c.source}")
        >>>
        >>> # Load Python files from examples
        >>> loader = GitHubLoader(
        ...     "nfraxlab/svc-infra",
        ...     path="examples/src",
        ...     pattern="*.py",
        ...     skip_patterns=["__pycache__", "test_*"],
        ... )
        >>> contents = await loader.load()
        >>>
        >>> # Private repo with token
        >>> loader = GitHubLoader(
        ...     "myorg/private-repo",
        ...     token="ghp_xxxx",  # or set GITHUB_TOKEN env var
        ... )
        >>> contents = await loader.load()

    Note:
        - GitHub API rate limits: 60 requests/hour unauthenticated,
          5000 requests/hour with token
        - One request fetches the tree; each matching file is then
          downloaded with its own request
        - GitHub may truncate the tree of large repos; a warning is logged
          when that happens
    """

    GITHUB_API = "https://api.github.com"
    GITHUB_RAW = "https://raw.githubusercontent.com"

    DEFAULT_SKIP_PATTERNS: list[str] = [
        "__pycache__",
        "*.pyc",
        "*.pyo",
        ".git",
        ".github",
        "node_modules",
        "*.lock",
        ".env*",
        ".DS_Store",
        "*.egg-info",
        "dist",
        "build",
        "*.min.js",
        "*.min.css",
    ]

    # Content types by extension
    CONTENT_TYPES: dict[str, str] = {
        "md": "text/markdown",
        "py": "text/x-python",
        "json": "application/json",
        "yaml": "text/yaml",
        "yml": "text/yaml",
        "toml": "text/toml",
        "sql": "text/x-sql",
        "html": "text/html",
        "css": "text/css",
        "js": "text/javascript",
        "ts": "text/typescript",
        "tsx": "text/typescript",
        "jsx": "text/javascript",
        "txt": "text/plain",
        "rst": "text/x-rst",
        "ini": "text/plain",
        "cfg": "text/plain",
        "sh": "text/x-shellscript",
        "bash": "text/x-shellscript",
        "zsh": "text/x-shellscript",
    }

    def __init__(
        self,
        repo: str,
        path: str = "",
        branch: str = "main",
        pattern: str = "*.md",
        token: str | None = None,
        recursive: bool = True,
        skip_patterns: list[str] | None = None,
        extra_metadata: dict[str, Any] | None = None,
        on_error: ErrorStrategy = "skip",
    ) -> None:
        """Initialize the GitHub loader.

        Args:
            repo: Repository in "owner/repo" format
            path: Path within repo (empty string for root)
            branch: Branch name
            pattern: Glob pattern for files to include
            token: GitHub token (or use GITHUB_TOKEN env var)
            recursive: Search subdirectories
            skip_patterns: Patterns to skip (overrides defaults if provided)
            extra_metadata: Additional metadata for all content
            on_error: Error handling strategy

        Raises:
            ValueError: If repo is not exactly "owner/repo" with a non-empty
                owner and a non-empty repository name.
        """
        super().__init__(on_error=on_error)

        # Validate repo format: exactly one slash with text on both sides.
        owner, slash, name = repo.partition("/")
        if not slash or not owner or not name or "/" in name:
            raise ValueError(
                f"Invalid repo format: {repo!r}. Expected 'owner/repo' format."
            )

        self.repo = repo
        self.path = path.strip("/")
        self.branch = branch
        self.pattern = pattern
        self.token = token or os.getenv("GITHUB_TOKEN", "")
        self.recursive = recursive
        # Copy the defaults so that mutating one instance's list cannot
        # change the class-level defaults shared by every loader.
        self.skip_patterns = (
            skip_patterns
            if skip_patterns is not None
            else list(self.DEFAULT_SKIP_PATTERNS)
        )
        self.extra_metadata = extra_metadata or {}

    def _get_headers(self) -> dict[str, str]:
        """Get headers for GitHub API requests."""
        headers = {
            "Accept": "application/vnd.github.v3+json",
            "User-Agent": "svc-infra-loader",
        }
        if self.token:
            headers["Authorization"] = f"token {self.token}"
        return headers

    def _should_skip(self, file_path: str) -> bool:
        """Check if file should be skipped based on patterns.

        Every "/"-separated component of the path (directories and the file
        name) is tested against every skip pattern.
        """
        # NOTE(review): components of the configured base path are tested
        # too, so e.g. path=".github" combined with the default skip
        # patterns selects nothing - confirm this is intended.
        return any(
            fnmatch.fnmatch(part, skip)
            for part in file_path.split("/")
            for skip in self.skip_patterns
        )

    def _matches_pattern(self, filename: str) -> bool:
        """Check if filename matches the include pattern."""
        # Support multiple patterns separated by |
        patterns = self.pattern.split("|")
        return any(fnmatch.fnmatch(filename, p.strip()) for p in patterns)

    def _guess_content_type(self, path: str) -> str:
        """Guess content type from file extension ("text/plain" if unknown)."""
        if "." not in path:
            return "text/plain"
        ext = path.rsplit(".", 1)[-1].lower()
        return self.CONTENT_TYPES.get(ext, "text/plain")

    async def _fetch_tree(
        self, client: httpx.AsyncClient, headers: dict[str, str]
    ) -> list[dict[str, Any]]:
        """Fetch the repository tree entries for the configured branch."""
        tree_url = f"{self.GITHUB_API}/repos/{self.repo}/git/trees/{self.branch}"
        # A non-recursive listing only contains the entries of the root
        # tree, so a base path always needs the recursive listing; the
        # depth limit for recursive=False is applied in _select_files().
        if self.recursive or self.path:
            tree_url += "?recursive=1"

        logger.debug("Fetching tree from: %s", tree_url)

        try:
            resp = await client.get(tree_url, headers=headers)
            resp.raise_for_status()
        except httpx.HTTPStatusError as e:
            status = e.response.status_code
            if status == 404:
                raise ValueError(
                    f"Repository or branch not found: {self.repo}@{self.branch}"
                ) from e
            if status == 403:
                # Report the remaining quota so rate limiting is recognisable
                remaining = e.response.headers.get("X-RateLimit-Remaining", "?")
                raise ValueError(
                    f"GitHub API access denied (rate limit remaining: {remaining}). "
                    f"Set GITHUB_TOKEN environment variable for higher limits."
                ) from e
            raise

        tree_data = resp.json()
        if tree_data.get("truncated", False):
            logger.warning(
                "Repository tree was truncated by GitHub API. "
                "Some files may not be loaded. Consider narrowing the path."
            )
        return tree_data.get("tree", [])

    def _select_files(self, tree: list[dict[str, Any]]) -> list[str]:
        """Return the paths of tree blobs that pass the path and pattern filters."""
        path_prefix = f"{self.path}/" if self.path else ""
        selected: list[str] = []

        for item in tree:
            # Only process files (blobs)
            if item.get("type") != "blob":
                continue

            file_path = item.get("path", "")

            # Must be under specified path
            if path_prefix and not file_path.startswith(path_prefix):
                continue

            # Without recursion only files directly inside the base path count
            if not self.recursive and "/" in file_path[len(path_prefix):]:
                continue

            if self._should_skip(file_path):
                logger.debug("Skipping (matches skip pattern): %s", file_path)
                continue

            # The include pattern is matched against the file name only
            if not self._matches_pattern(file_path.rsplit("/", 1)[-1]):
                continue

            selected.append(file_path)

        return selected

    async def load(self) -> list[LoadedContent]:
        """Load all matching files from the GitHub repository.

        Returns:
            List of LoadedContent objects for each matching file.

        Raises:
            ValueError: If the tree request returns 404 (repository or branch
                not found) or 403 (access denied / rate limited).
            httpx.HTTPError: If the tree request fails for another reason.
            RuntimeError: If a file download fails and on_error="raise".
        """
        # Imported locally so this block adds no file-level dependency.
        from urllib.parse import quote

        contents: list[LoadedContent] = []
        headers = self._get_headers()
        path_prefix = f"{self.path}/" if self.path else ""

        async with httpx.AsyncClient(timeout=30.0) as client:
            tree = await self._fetch_tree(client, headers)
            matching_files = self._select_files(tree)

            logger.info(
                "Found %d matching files in %s", len(matching_files), self.repo
            )

            # Fetch content for each matching file
            for file_path in matching_files:
                # Percent-encode the path so names containing "#", "?", "%"
                # or spaces are not parsed as URL delimiters.
                raw_url = (
                    f"{self.GITHUB_RAW}/{self.repo}/{self.branch}/{quote(file_path)}"
                )

                try:
                    resp = await client.get(raw_url, headers=headers)
                    resp.raise_for_status()
                    content = resp.text
                except httpx.HTTPError as e:
                    msg = f"Failed to fetch {file_path}: {e}"
                    if self.on_error == "raise":
                        raise RuntimeError(msg) from e
                    logger.warning(msg)
                    continue

                # Build relative path from specified base path
                rel_path = file_path[len(path_prefix):] if path_prefix else file_path

                loaded = LoadedContent(
                    content=content,
                    source=f"github://{self.repo}/{file_path}",
                    content_type=self._guess_content_type(file_path),
                    metadata={
                        "loader": "github",
                        "repo": self.repo,
                        "branch": self.branch,
                        "path": rel_path,
                        "full_path": file_path,
                        # User-supplied keys win over the built-in ones
                        **self.extra_metadata,
                    },
                )
                contents.append(loaded)
                logger.debug("Loaded: %s (%d chars)", file_path, len(content))

        return contents

    def __repr__(self) -> str:
        """Return string representation."""
        return (
            f"GitHubLoader({self.repo!r}, path={self.path!r}, "
            f"branch={self.branch!r}, pattern={self.pattern!r})"
        )
@@ -0,0 +1,147 @@
1
+ """Data models for content loaders.
2
+
3
+ This module defines the core data structures used by all loaders.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from dataclasses import dataclass, field
9
+ from typing import Any
10
+
11
+
12
@dataclass
class LoadedContent:
    """Content loaded from a remote or local source.

    This is the standard output format for all loaders. It's designed to be
    compatible with ai-infra's Retriever.add_text() method.

    Attributes:
        content: The text content that was loaded.
        metadata: Flexible metadata dict. Loaders add source-specific metadata
            (e.g., repo, path, branch for GitHub). Users can add custom metadata
            via the loader's `extra_metadata` parameter. A non-empty `source`
            is copied into it under the "source" key unless that key is
            already present.
        source: Source identifier (URL, file path, GitHub URI, etc.).
            Format varies by loader:
            - GitHubLoader: "github://owner/repo/path"
            - URLLoader: "https://example.com/page"
            - S3Loader: "s3://bucket/key"
        content_type: MIME type or content category (e.g., "text/markdown",
            "text/x-python", "text/html"). None if unknown.
        encoding: Character encoding (default: utf-8).

    Example:
        >>> content = LoadedContent(
        ...     content="# Authentication\\n\\nThis guide covers...",
        ...     source="github://nfraxlab/svc-infra/docs/auth.md",
        ...     content_type="text/markdown",
        ...     metadata={"repo": "nfraxlab/svc-infra", "path": "docs/auth.md"},
        ... )
        >>>
        >>> # Use with ai-infra Retriever
        >>> retriever.add_text(content.content, metadata=content.metadata)
    """

    content: str
    metadata: dict[str, Any] = field(default_factory=dict)
    source: str = ""
    content_type: str | None = None
    encoding: str = "utf-8"

    def __post_init__(self) -> None:
        """Validate and normalize fields after initialization."""
        # Ensure metadata is a dict
        if self.metadata is None:
            self.metadata = {}

        # Add source to metadata if not already present
        if self.source and "source" not in self.metadata:
            self.metadata["source"] = self.source

    def to_tuple(self) -> tuple[str, dict[str, Any]]:
        """Convert to (content, metadata) tuple.

        This format is compatible with ai-infra's Retriever.add_text() and
        the legacy LoadedDocument type from ai-infra/retriever/loaders.py.

        Returns:
            Tuple of (content, metadata).

        Example:
            >>> content, metadata = loaded_content.to_tuple()
            >>> retriever.add_text(content, metadata=metadata)
        """
        return (self.content, self.metadata)

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for JSON serialization.

        Returns:
            Dictionary with all fields, suitable for JSON serialization.

        Example:
            >>> data = loaded_content.to_dict()
            >>> json.dumps(data)
        """
        return {
            "content": self.content,
            "metadata": self.metadata,
            "source": self.source,
            "content_type": self.content_type,
            "encoding": self.encoding,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> LoadedContent:
        """Create LoadedContent from a dictionary.

        Missing keys fall back to the field defaults (empty content, empty
        metadata, empty source, no content type, "utf-8" encoding).

        Args:
            data: Dictionary with content, metadata, source, etc.

        Returns:
            New LoadedContent instance.

        Example:
            >>> data = {"content": "Hello", "source": "test.txt"}
            >>> content = LoadedContent.from_dict(data)
        """
        # Copy the metadata: __post_init__ may add a "source" key, and that
        # must not leak back into the caller's input dictionary.
        return cls(
            content=data.get("content", ""),
            metadata=dict(data.get("metadata") or {}),
            source=data.get("source", ""),
            content_type=data.get("content_type"),
            encoding=data.get("encoding", "utf-8"),
        )

    def __len__(self) -> int:
        """Return the length of the content."""
        return len(self.content)

    def __bool__(self) -> bool:
        """Return True if content contains any non-whitespace character."""
        return bool(self.content.strip())
123
+
124
+
125
# Legacy (content, metadata) pair, kept for compatibility with ai-infra loaders
LoadedDocument = tuple[str, dict[str, Any]]


def to_loaded_documents(contents: list[LoadedContent]) -> list[LoadedDocument]:
    """Turn LoadedContent objects into legacy (content, metadata) tuples.

    Compatibility helper for callers that still expect the tuple format
    used by ai-infra/retriever/loaders.py.

    Args:
        contents: List of LoadedContent objects.

    Returns:
        One (content, metadata) tuple per input item, in the same order.

    Example:
        >>> contents = await loader.load()
        >>> documents = to_loaded_documents(contents)
        >>> for content, metadata in documents:
        ...     retriever.add_text(content, metadata=metadata)
    """
    documents: list[LoadedDocument] = []
    for item in contents:
        documents.append(item.to_tuple())
    return documents