svc-infra 0.1.595__py3-none-any.whl → 0.1.706__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of svc-infra might be problematic.
- svc_infra/__init__.py +58 -2
- svc_infra/apf_payments/models.py +133 -42
- svc_infra/apf_payments/provider/aiydan.py +121 -47
- svc_infra/apf_payments/provider/base.py +30 -9
- svc_infra/apf_payments/provider/stripe.py +156 -62
- svc_infra/apf_payments/schemas.py +18 -9
- svc_infra/apf_payments/service.py +98 -41
- svc_infra/apf_payments/settings.py +5 -1
- svc_infra/api/__init__.py +61 -0
- svc_infra/api/fastapi/__init__.py +15 -0
- svc_infra/api/fastapi/admin/__init__.py +3 -0
- svc_infra/api/fastapi/admin/add.py +245 -0
- svc_infra/api/fastapi/apf_payments/router.py +128 -70
- svc_infra/api/fastapi/apf_payments/setup.py +13 -6
- svc_infra/api/fastapi/auth/__init__.py +65 -0
- svc_infra/api/fastapi/auth/_cookies.py +6 -2
- svc_infra/api/fastapi/auth/add.py +17 -14
- svc_infra/api/fastapi/auth/gaurd.py +45 -16
- svc_infra/api/fastapi/auth/mfa/models.py +3 -1
- svc_infra/api/fastapi/auth/mfa/pre_auth.py +10 -6
- svc_infra/api/fastapi/auth/mfa/router.py +15 -8
- svc_infra/api/fastapi/auth/mfa/security.py +1 -2
- svc_infra/api/fastapi/auth/mfa/utils.py +2 -1
- svc_infra/api/fastapi/auth/mfa/verify.py +9 -2
- svc_infra/api/fastapi/auth/policy.py +0 -1
- svc_infra/api/fastapi/auth/providers.py +3 -1
- svc_infra/api/fastapi/auth/routers/apikey_router.py +6 -6
- svc_infra/api/fastapi/auth/routers/oauth_router.py +146 -52
- svc_infra/api/fastapi/auth/routers/session_router.py +6 -2
- svc_infra/api/fastapi/auth/security.py +31 -10
- svc_infra/api/fastapi/auth/sender.py +8 -1
- svc_infra/api/fastapi/auth/state.py +3 -1
- svc_infra/api/fastapi/auth/ws_security.py +275 -0
- svc_infra/api/fastapi/billing/router.py +73 -0
- svc_infra/api/fastapi/billing/setup.py +19 -0
- svc_infra/api/fastapi/cache/add.py +9 -5
- svc_infra/api/fastapi/db/__init__.py +5 -1
- svc_infra/api/fastapi/db/http.py +3 -1
- svc_infra/api/fastapi/db/nosql/__init__.py +39 -1
- svc_infra/api/fastapi/db/nosql/mongo/add.py +47 -32
- svc_infra/api/fastapi/db/nosql/mongo/crud_router.py +30 -11
- svc_infra/api/fastapi/db/sql/__init__.py +5 -1
- svc_infra/api/fastapi/db/sql/add.py +71 -26
- svc_infra/api/fastapi/db/sql/crud_router.py +210 -22
- svc_infra/api/fastapi/db/sql/health.py +3 -1
- svc_infra/api/fastapi/db/sql/session.py +18 -0
- svc_infra/api/fastapi/db/sql/users.py +18 -6
- svc_infra/api/fastapi/dependencies/ratelimit.py +78 -14
- svc_infra/api/fastapi/docs/add.py +173 -0
- svc_infra/api/fastapi/docs/landing.py +4 -2
- svc_infra/api/fastapi/docs/scoped.py +62 -15
- svc_infra/api/fastapi/dual/__init__.py +12 -2
- svc_infra/api/fastapi/dual/dualize.py +1 -1
- svc_infra/api/fastapi/dual/protected.py +126 -4
- svc_infra/api/fastapi/dual/public.py +25 -0
- svc_infra/api/fastapi/dual/router.py +40 -13
- svc_infra/api/fastapi/dx.py +33 -2
- svc_infra/api/fastapi/ease.py +10 -2
- svc_infra/api/fastapi/http/concurrency.py +2 -1
- svc_infra/api/fastapi/http/conditional.py +3 -1
- svc_infra/api/fastapi/middleware/debug.py +4 -1
- svc_infra/api/fastapi/middleware/errors/catchall.py +6 -2
- svc_infra/api/fastapi/middleware/errors/exceptions.py +1 -1
- svc_infra/api/fastapi/middleware/errors/handlers.py +54 -8
- svc_infra/api/fastapi/middleware/graceful_shutdown.py +104 -0
- svc_infra/api/fastapi/middleware/idempotency.py +197 -70
- svc_infra/api/fastapi/middleware/idempotency_store.py +187 -0
- svc_infra/api/fastapi/middleware/optimistic_lock.py +42 -0
- svc_infra/api/fastapi/middleware/ratelimit.py +125 -28
- svc_infra/api/fastapi/middleware/ratelimit_store.py +43 -10
- svc_infra/api/fastapi/middleware/request_id.py +27 -11
- svc_infra/api/fastapi/middleware/request_size_limit.py +3 -3
- svc_infra/api/fastapi/middleware/timeout.py +177 -0
- svc_infra/api/fastapi/openapi/apply.py +5 -3
- svc_infra/api/fastapi/openapi/conventions.py +9 -2
- svc_infra/api/fastapi/openapi/mutators.py +165 -20
- svc_infra/api/fastapi/openapi/pipeline.py +1 -1
- svc_infra/api/fastapi/openapi/security.py +3 -1
- svc_infra/api/fastapi/ops/add.py +75 -0
- svc_infra/api/fastapi/pagination.py +47 -20
- svc_infra/api/fastapi/routers/__init__.py +43 -15
- svc_infra/api/fastapi/routers/ping.py +1 -0
- svc_infra/api/fastapi/setup.py +188 -57
- svc_infra/api/fastapi/tenancy/add.py +19 -0
- svc_infra/api/fastapi/tenancy/context.py +112 -0
- svc_infra/api/fastapi/versioned.py +101 -0
- svc_infra/app/README.md +5 -5
- svc_infra/app/__init__.py +3 -1
- svc_infra/app/env.py +69 -1
- svc_infra/app/logging/add.py +9 -2
- svc_infra/app/logging/formats.py +12 -5
- svc_infra/billing/__init__.py +23 -0
- svc_infra/billing/async_service.py +147 -0
- svc_infra/billing/jobs.py +241 -0
- svc_infra/billing/models.py +177 -0
- svc_infra/billing/quotas.py +103 -0
- svc_infra/billing/schemas.py +36 -0
- svc_infra/billing/service.py +123 -0
- svc_infra/bundled_docs/README.md +5 -0
- svc_infra/bundled_docs/__init__.py +1 -0
- svc_infra/bundled_docs/getting-started.md +6 -0
- svc_infra/cache/__init__.py +9 -0
- svc_infra/cache/add.py +170 -0
- svc_infra/cache/backend.py +7 -6
- svc_infra/cache/decorators.py +81 -15
- svc_infra/cache/demo.py +2 -2
- svc_infra/cache/keys.py +24 -4
- svc_infra/cache/recache.py +26 -14
- svc_infra/cache/resources.py +14 -5
- svc_infra/cache/tags.py +19 -44
- svc_infra/cache/utils.py +3 -1
- svc_infra/cli/__init__.py +52 -8
- svc_infra/cli/__main__.py +4 -0
- svc_infra/cli/cmds/__init__.py +39 -2
- svc_infra/cli/cmds/db/nosql/mongo/mongo_cmds.py +7 -4
- svc_infra/cli/cmds/db/nosql/mongo/mongo_scaffold_cmds.py +7 -5
- svc_infra/cli/cmds/db/ops_cmds.py +270 -0
- svc_infra/cli/cmds/db/sql/alembic_cmds.py +103 -18
- svc_infra/cli/cmds/db/sql/sql_export_cmds.py +88 -0
- svc_infra/cli/cmds/db/sql/sql_scaffold_cmds.py +3 -3
- svc_infra/cli/cmds/docs/docs_cmds.py +142 -0
- svc_infra/cli/cmds/dx/__init__.py +12 -0
- svc_infra/cli/cmds/dx/dx_cmds.py +116 -0
- svc_infra/cli/cmds/health/__init__.py +179 -0
- svc_infra/cli/cmds/health/health_cmds.py +8 -0
- svc_infra/cli/cmds/help.py +4 -0
- svc_infra/cli/cmds/jobs/__init__.py +1 -0
- svc_infra/cli/cmds/jobs/jobs_cmds.py +47 -0
- svc_infra/cli/cmds/obs/obs_cmds.py +36 -15
- svc_infra/cli/cmds/sdk/__init__.py +0 -0
- svc_infra/cli/cmds/sdk/sdk_cmds.py +112 -0
- svc_infra/cli/foundation/runner.py +6 -2
- svc_infra/data/add.py +61 -0
- svc_infra/data/backup.py +58 -0
- svc_infra/data/erasure.py +45 -0
- svc_infra/data/fixtures.py +42 -0
- svc_infra/data/retention.py +61 -0
- svc_infra/db/__init__.py +15 -0
- svc_infra/db/crud_schema.py +9 -9
- svc_infra/db/inbox.py +67 -0
- svc_infra/db/nosql/__init__.py +3 -0
- svc_infra/db/nosql/core.py +30 -9
- svc_infra/db/nosql/indexes.py +3 -1
- svc_infra/db/nosql/management.py +1 -1
- svc_infra/db/nosql/mongo/README.md +13 -13
- svc_infra/db/nosql/mongo/client.py +19 -2
- svc_infra/db/nosql/mongo/settings.py +6 -2
- svc_infra/db/nosql/repository.py +35 -15
- svc_infra/db/nosql/resource.py +20 -3
- svc_infra/db/nosql/scaffold.py +9 -3
- svc_infra/db/nosql/service.py +3 -1
- svc_infra/db/nosql/types.py +6 -2
- svc_infra/db/ops.py +384 -0
- svc_infra/db/outbox.py +108 -0
- svc_infra/db/sql/apikey.py +37 -9
- svc_infra/db/sql/authref.py +9 -3
- svc_infra/db/sql/constants.py +12 -8
- svc_infra/db/sql/core.py +2 -2
- svc_infra/db/sql/management.py +11 -8
- svc_infra/db/sql/repository.py +99 -26
- svc_infra/db/sql/resource.py +5 -0
- svc_infra/db/sql/scaffold.py +6 -2
- svc_infra/db/sql/service.py +15 -5
- svc_infra/db/sql/templates/models_schemas/auth/models.py.tmpl +7 -56
- svc_infra/db/sql/templates/setup/env_async.py.tmpl +34 -12
- svc_infra/db/sql/templates/setup/env_sync.py.tmpl +29 -7
- svc_infra/db/sql/tenant.py +88 -0
- svc_infra/db/sql/uniq_hooks.py +9 -3
- svc_infra/db/sql/utils.py +138 -51
- svc_infra/db/sql/versioning.py +14 -0
- svc_infra/deploy/__init__.py +538 -0
- svc_infra/documents/__init__.py +100 -0
- svc_infra/documents/add.py +264 -0
- svc_infra/documents/ease.py +233 -0
- svc_infra/documents/models.py +114 -0
- svc_infra/documents/storage.py +264 -0
- svc_infra/dx/add.py +65 -0
- svc_infra/dx/changelog.py +74 -0
- svc_infra/dx/checks.py +68 -0
- svc_infra/exceptions.py +141 -0
- svc_infra/health/__init__.py +864 -0
- svc_infra/http/__init__.py +13 -0
- svc_infra/http/client.py +105 -0
- svc_infra/jobs/builtins/outbox_processor.py +40 -0
- svc_infra/jobs/builtins/webhook_delivery.py +95 -0
- svc_infra/jobs/easy.py +33 -0
- svc_infra/jobs/loader.py +50 -0
- svc_infra/jobs/queue.py +116 -0
- svc_infra/jobs/redis_queue.py +256 -0
- svc_infra/jobs/runner.py +79 -0
- svc_infra/jobs/scheduler.py +53 -0
- svc_infra/jobs/worker.py +40 -0
- svc_infra/loaders/__init__.py +186 -0
- svc_infra/loaders/base.py +142 -0
- svc_infra/loaders/github.py +311 -0
- svc_infra/loaders/models.py +147 -0
- svc_infra/loaders/url.py +235 -0
- svc_infra/logging/__init__.py +374 -0
- svc_infra/mcp/svc_infra_mcp.py +91 -33
- svc_infra/obs/README.md +2 -0
- svc_infra/obs/add.py +65 -9
- svc_infra/obs/cloud_dash.py +2 -1
- svc_infra/obs/grafana/dashboards/http-overview.json +45 -0
- svc_infra/obs/metrics/__init__.py +3 -4
- svc_infra/obs/metrics/asgi.py +13 -7
- svc_infra/obs/metrics/http.py +9 -5
- svc_infra/obs/metrics/sqlalchemy.py +13 -9
- svc_infra/obs/metrics.py +6 -5
- svc_infra/obs/settings.py +6 -2
- svc_infra/security/add.py +217 -0
- svc_infra/security/audit.py +92 -10
- svc_infra/security/audit_service.py +4 -3
- svc_infra/security/headers.py +15 -2
- svc_infra/security/hibp.py +14 -4
- svc_infra/security/jwt_rotation.py +74 -22
- svc_infra/security/lockout.py +11 -5
- svc_infra/security/models.py +54 -12
- svc_infra/security/oauth_models.py +73 -0
- svc_infra/security/org_invites.py +5 -3
- svc_infra/security/passwords.py +3 -1
- svc_infra/security/permissions.py +25 -2
- svc_infra/security/session.py +1 -1
- svc_infra/security/signed_cookies.py +21 -1
- svc_infra/storage/__init__.py +93 -0
- svc_infra/storage/add.py +253 -0
- svc_infra/storage/backends/__init__.py +11 -0
- svc_infra/storage/backends/local.py +339 -0
- svc_infra/storage/backends/memory.py +216 -0
- svc_infra/storage/backends/s3.py +353 -0
- svc_infra/storage/base.py +239 -0
- svc_infra/storage/easy.py +185 -0
- svc_infra/storage/settings.py +195 -0
- svc_infra/testing/__init__.py +685 -0
- svc_infra/utils.py +7 -3
- svc_infra/webhooks/__init__.py +69 -0
- svc_infra/webhooks/add.py +339 -0
- svc_infra/webhooks/encryption.py +115 -0
- svc_infra/webhooks/fastapi.py +39 -0
- svc_infra/webhooks/router.py +55 -0
- svc_infra/webhooks/service.py +70 -0
- svc_infra/webhooks/signing.py +34 -0
- svc_infra/websocket/__init__.py +79 -0
- svc_infra/websocket/add.py +140 -0
- svc_infra/websocket/client.py +282 -0
- svc_infra/websocket/config.py +69 -0
- svc_infra/websocket/easy.py +76 -0
- svc_infra/websocket/exceptions.py +61 -0
- svc_infra/websocket/manager.py +344 -0
- svc_infra/websocket/models.py +49 -0
- svc_infra-0.1.706.dist-info/LICENSE +21 -0
- svc_infra-0.1.706.dist-info/METADATA +356 -0
- svc_infra-0.1.706.dist-info/RECORD +357 -0
- svc_infra-0.1.595.dist-info/METADATA +0 -80
- svc_infra-0.1.595.dist-info/RECORD +0 -253
- {svc_infra-0.1.595.dist-info → svc_infra-0.1.706.dist-info}/WHEEL +0 -0
- {svc_infra-0.1.595.dist-info → svc_infra-0.1.706.dist-info}/entry_points.txt +0 -0
svc_infra/loaders/base.py
@@ -0,0 +1,142 @@
+"""Base loader class for content loaders.
+
+This module defines the abstract base class that all loaders must implement.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, AsyncIterator, Literal
+
+if TYPE_CHECKING:
+    from .models import LoadedContent
+
+logger = logging.getLogger(__name__)
+
+
+# Error handling strategies
+ErrorStrategy = Literal["skip", "raise"]
+
+
+class BaseLoader(ABC):
+    """Abstract base class for content loaders.
+
+    All loaders are async-first with sync wrappers provided for convenience.
+    Subclasses must implement the `load()` method.
+
+    Attributes:
+        on_error: How to handle errors during loading.
+            - "skip" (default): Log warning and skip failed items
+            - "raise": Raise exception on first failure
+
+    Example:
+        >>> class MyLoader(BaseLoader):
+        ...     async def load(self) -> list[LoadedContent]:
+        ...         # Implement loading logic
+        ...         return [LoadedContent(content="...", source="...")]
+        >>>
+        >>> # Async usage (preferred)
+        >>> loader = MyLoader()
+        >>> contents = await loader.load()
+        >>>
+        >>> # Sync usage (convenience)
+        >>> contents = loader.load_sync()
+    """
+
+    def __init__(
+        self,
+        on_error: ErrorStrategy = "skip",
+    ) -> None:
+        """Initialize the base loader.
+
+        Args:
+            on_error: Error handling strategy ("skip" or "raise").
+        """
+        self.on_error = on_error
+
+    @abstractmethod
+    async def load(self) -> list["LoadedContent"]:
+        """Load all content from the source.
+
+        This is the main method that subclasses must implement.
+
+        Returns:
+            List of LoadedContent objects.
+
+        Raises:
+            Various exceptions depending on the loader and error strategy.
+        """
+        ...
+
+    async def aiter(self) -> AsyncIterator["LoadedContent"]:
+        """Iterate over loaded content asynchronously.
+
+        This is useful for progress tracking or streaming large datasets.
+        Default implementation loads all content first, but subclasses
+        can override for true streaming.
+
+        Yields:
+            LoadedContent objects as they are loaded.
+
+        Example:
+            >>> async for content in loader.aiter():
+            ...     print(f"Loaded: {content.source}")
+            ...     process(content)
+        """
+        for content in await self.load():
+            yield content
+
+    def load_sync(self) -> list["LoadedContent"]:
+        """Synchronous wrapper for load().
+
+        Creates a new event loop if needed. Prefer the async version
+        when running in an async context.
+
+        Returns:
+            List of LoadedContent objects.
+
+        Example:
+            >>> # In a script or notebook
+            >>> loader = GitHubLoader("nfraxlab/svc-infra", path="docs")
+            >>> contents = loader.load_sync()
+        """
+        try:
+            # Check if we're already in an async context
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            # No running loop - safe to use asyncio.run()
+            return asyncio.run(self.load())
+
+        # Already in async context - use nest_asyncio if available
+        try:
+            import nest_asyncio
+
+            nest_asyncio.apply()
+            return loop.run_until_complete(self.load())
+        except ImportError:
+            raise RuntimeError(
+                "Cannot call load_sync() from within an async context. "
+                "Use 'await loader.load()' instead, or install nest_asyncio: "
+                "pip install nest-asyncio"
+            )
+
+    def _handle_error(self, error: Exception, context: str) -> None:
+        """Handle an error according to the error strategy.
+
+        Args:
+            error: The exception that occurred.
+            context: Description of what was being done (for logging).
+
+        Raises:
+            The original exception if on_error="raise".
+        """
+        if self.on_error == "raise":
+            raise error
+        else:
+            logger.warning(f"Skipping {context}: {error}")
+
+    def __repr__(self) -> str:
+        """Return string representation of the loader."""
+        return f"{self.__class__.__name__}(on_error={self.on_error!r})"
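For orientation, here is a minimal sketch of how a concrete subclass of the new BaseLoader could look, based only on the docstrings above. The LocalFileLoader name is purely illustrative and not part of the package; only BaseLoader, LoadedContent, and _handle_error come from svc_infra.loaders.

import asyncio
from pathlib import Path

from svc_infra.loaders.base import BaseLoader, ErrorStrategy
from svc_infra.loaders.models import LoadedContent


class LocalFileLoader(BaseLoader):
    """Illustrative loader that reads UTF-8 text files from a local directory."""

    def __init__(self, root: str, pattern: str = "*.md", on_error: ErrorStrategy = "skip") -> None:
        super().__init__(on_error=on_error)
        self.root = Path(root)
        self.pattern = pattern

    async def load(self) -> list[LoadedContent]:
        contents: list[LoadedContent] = []
        for path in self.root.rglob(self.pattern):
            try:
                text = path.read_text(encoding="utf-8")
            except OSError as exc:
                # Defer to the base class error strategy ("skip" logs, "raise" re-raises).
                self._handle_error(exc, context=str(path))
                continue
            contents.append(LoadedContent(content=text, source=str(path)))
        return contents


if __name__ == "__main__":
    # Async usage is preferred; load_sync() is the convenience wrapper shown above.
    print(asyncio.run(LocalFileLoader("docs").load()))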
svc_infra/loaders/github.py
@@ -0,0 +1,311 @@
+"""GitHub content loader.
+
+Load files from GitHub repositories using the GitHub API.
+"""
+
+from __future__ import annotations
+
+import fnmatch
+import logging
+import os
+from typing import Any
+
+import httpx
+
+from .base import BaseLoader, ErrorStrategy
+from .models import LoadedContent
+
+logger = logging.getLogger(__name__)
+
+
+class GitHubLoader(BaseLoader):
+    """Load files from a GitHub repository.
+
+    Fetches files matching a pattern from a GitHub repo using the GitHub API.
+    Supports public repos and private repos (with token).
+
+    Args:
+        repo: Repository in "owner/repo" format (e.g., "nfraxlab/svc-infra")
+        path: Path within repo to load from (e.g., "docs", "examples/src").
+            Empty string means repo root.
+        branch: Branch name (default: "main")
+        pattern: Glob pattern for files to include (default: "*.md").
+            Use "*" to match all files.
+        token: GitHub token for private repos or higher rate limits.
+            Falls back to GITHUB_TOKEN environment variable.
+        recursive: Whether to search subdirectories (default: True)
+        skip_patterns: List of patterns to skip. Default patterns are:
+            __pycache__, *.pyc, *.pyo, .git, node_modules, *.lock, .env*
+        extra_metadata: Additional metadata to attach to all loaded content.
+        on_error: How to handle errors ("skip" or "raise"). Default: "skip"
+
+    Example:
+        >>> # Load all markdown from docs/
+        >>> loader = GitHubLoader("nfraxlab/svc-infra", path="docs")
+        >>> contents = await loader.load()
+        >>> for c in contents:
+        ...     print(f"Loaded: {c.source}")
+        >>>
+        >>> # Load Python files from examples
+        >>> loader = GitHubLoader(
+        ...     "nfraxlab/svc-infra",
+        ...     path="examples/src",
+        ...     pattern="*.py",
+        ...     skip_patterns=["__pycache__", "test_*"],
+        ... )
+        >>> contents = await loader.load()
+        >>>
+        >>> # Private repo with token
+        >>> loader = GitHubLoader(
+        ...     "myorg/private-repo",
+        ...     token="ghp_xxxx",  # or set GITHUB_TOKEN env var
+        ... )
+        >>> contents = await loader.load()
+
+    Note:
+        - GitHub API rate limits: 60 requests/hour unauthenticated,
+          5000 requests/hour with token
+        - Large repos may require multiple API calls (tree is fetched recursively)
+        - Binary files are automatically skipped
+    """
+
+    GITHUB_API = "https://api.github.com"
+    GITHUB_RAW = "https://raw.githubusercontent.com"
+
+    DEFAULT_SKIP_PATTERNS: list[str] = [
+        "__pycache__",
+        "*.pyc",
+        "*.pyo",
+        ".git",
+        ".github",
+        "node_modules",
+        "*.lock",
+        ".env*",
+        ".DS_Store",
+        "*.egg-info",
+        "dist",
+        "build",
+        "*.min.js",
+        "*.min.css",
+    ]
+
+    # Content types by extension
+    CONTENT_TYPES: dict[str, str] = {
+        "md": "text/markdown",
+        "py": "text/x-python",
+        "json": "application/json",
+        "yaml": "text/yaml",
+        "yml": "text/yaml",
+        "toml": "text/toml",
+        "sql": "text/x-sql",
+        "html": "text/html",
+        "css": "text/css",
+        "js": "text/javascript",
+        "ts": "text/typescript",
+        "tsx": "text/typescript",
+        "jsx": "text/javascript",
+        "txt": "text/plain",
+        "rst": "text/x-rst",
+        "ini": "text/plain",
+        "cfg": "text/plain",
+        "sh": "text/x-shellscript",
+        "bash": "text/x-shellscript",
+        "zsh": "text/x-shellscript",
+    }
+
+    def __init__(
+        self,
+        repo: str,
+        path: str = "",
+        branch: str = "main",
+        pattern: str = "*.md",
+        token: str | None = None,
+        recursive: bool = True,
+        skip_patterns: list[str] | None = None,
+        extra_metadata: dict[str, Any] | None = None,
+        on_error: ErrorStrategy = "skip",
+    ) -> None:
+        """Initialize the GitHub loader.
+
+        Args:
+            repo: Repository in "owner/repo" format
+            path: Path within repo (empty string for root)
+            branch: Branch name
+            pattern: Glob pattern for files to include
+            token: GitHub token (or use GITHUB_TOKEN env var)
+            recursive: Search subdirectories
+            skip_patterns: Patterns to skip (overrides defaults if provided)
+            extra_metadata: Additional metadata for all content
+            on_error: Error handling strategy
+        """
+        super().__init__(on_error=on_error)
+
+        # Validate repo format
+        if "/" not in repo or repo.count("/") != 1:
+            raise ValueError(
+                f"Invalid repo format: {repo!r}. Expected 'owner/repo' format."
+            )
+
+        self.repo = repo
+        self.path = path.strip("/")
+        self.branch = branch
+        self.pattern = pattern
+        self.token = token or os.getenv("GITHUB_TOKEN", "")
+        self.recursive = recursive
+        self.skip_patterns = (
+            skip_patterns if skip_patterns is not None else self.DEFAULT_SKIP_PATTERNS
+        )
+        self.extra_metadata = extra_metadata or {}
+
+    def _get_headers(self) -> dict[str, str]:
+        """Get headers for GitHub API requests."""
+        headers = {
+            "Accept": "application/vnd.github.v3+json",
+            "User-Agent": "svc-infra-loader",
+        }
+        if self.token:
+            headers["Authorization"] = f"token {self.token}"
+        return headers
+
+    def _should_skip(self, file_path: str) -> bool:
+        """Check if file should be skipped based on patterns."""
+        # Check each component of the path against skip patterns
+        parts = file_path.split("/")
+        for part in parts:
+            for skip in self.skip_patterns:
+                if fnmatch.fnmatch(part, skip):
+                    return True
+        return False
+
+    def _matches_pattern(self, filename: str) -> bool:
+        """Check if filename matches the include pattern."""
+        # Support multiple patterns separated by |
+        patterns = self.pattern.split("|")
+        return any(fnmatch.fnmatch(filename, p.strip()) for p in patterns)
+
+    def _guess_content_type(self, path: str) -> str:
+        """Guess content type from file extension."""
+        if "." not in path:
+            return "text/plain"
+        ext = path.rsplit(".", 1)[-1].lower()
+        return self.CONTENT_TYPES.get(ext, "text/plain")
+
+    async def load(self) -> list[LoadedContent]:
+        """Load all matching files from the GitHub repository.
+
+        Returns:
+            List of LoadedContent objects for each matching file.
+
+        Raises:
+            ValueError: If repository not found or access denied.
+            httpx.HTTPError: If API request fails and on_error="raise".
+        """
+        contents: list[LoadedContent] = []
+        headers = self._get_headers()
+
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            # Fetch repository tree
+            tree_url = f"{self.GITHUB_API}/repos/{self.repo}/git/trees/{self.branch}"
+            if self.recursive:
+                tree_url += "?recursive=1"
+
+            logger.debug(f"Fetching tree from: {tree_url}")
+
+            try:
+                resp = await client.get(tree_url, headers=headers)
+                resp.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                if e.response.status_code == 404:
+                    raise ValueError(
+                        f"Repository or branch not found: {self.repo}@{self.branch}"
+                    ) from e
+                elif e.response.status_code == 403:
+                    # Check if it's rate limiting
+                    remaining = e.response.headers.get("X-RateLimit-Remaining", "?")
+                    raise ValueError(
+                        f"GitHub API access denied (rate limit remaining: {remaining}). "
+                        f"Set GITHUB_TOKEN environment variable for higher limits."
+                    ) from e
+                raise
+
+            tree_data = resp.json()
+            tree = tree_data.get("tree", [])
+            truncated = tree_data.get("truncated", False)
+
+            if truncated:
+                logger.warning(
+                    "Repository tree was truncated by GitHub API. "
+                    "Some files may not be loaded. Consider narrowing the path."
+                )
+
+            # Filter files by path, pattern, and skip patterns
+            path_prefix = f"{self.path}/" if self.path else ""
+            matching_files: list[str] = []
+
+            for item in tree:
+                # Only process files (blobs)
+                if item.get("type") != "blob":
+                    continue
+
+                file_path = item.get("path", "")
+
+                # Must be under specified path
+                if path_prefix and not file_path.startswith(path_prefix):
+                    continue
+
+                # Check skip patterns
+                if self._should_skip(file_path):
+                    logger.debug(f"Skipping (matches skip pattern): {file_path}")
+                    continue
+
+                # Check include pattern against filename
+                filename = file_path.split("/")[-1]
+                if not self._matches_pattern(filename):
+                    continue
+
+                matching_files.append(file_path)
+
+            logger.info(f"Found {len(matching_files)} matching files in {self.repo}")
+
+            # Fetch content for each matching file
+            for file_path in matching_files:
+                raw_url = f"{self.GITHUB_RAW}/{self.repo}/{self.branch}/{file_path}"
+
+                try:
+                    resp = await client.get(raw_url, headers=headers)
+                    resp.raise_for_status()
+                    content = resp.text
+                except httpx.HTTPError as e:
+                    msg = f"Failed to fetch {file_path}: {e}"
+                    if self.on_error == "raise":
+                        raise RuntimeError(msg) from e
+                    logger.warning(msg)
+                    continue
+
+                # Build relative path from specified base path
+                rel_path = file_path[len(path_prefix) :] if path_prefix else file_path
+
+                loaded = LoadedContent(
+                    content=content,
+                    source=f"github://{self.repo}/{file_path}",
+                    content_type=self._guess_content_type(file_path),
+                    metadata={
+                        "loader": "github",
+                        "repo": self.repo,
+                        "branch": self.branch,
+                        "path": rel_path,
+                        "full_path": file_path,
+                        **self.extra_metadata,
+                    },
+                )
+                contents.append(loaded)
+                logger.debug(f"Loaded: {file_path} ({len(content)} chars)")
+
+        return contents
+
+    def __repr__(self) -> str:
+        """Return string representation."""
+        return (
+            f"GitHubLoader({self.repo!r}, path={self.path!r}, "
+            f"branch={self.branch!r}, pattern={self.pattern!r})"
+        )
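A short usage sketch for the GitHubLoader added above. The repository and path values are taken from the docstring example; network access is assumed, and an optional GITHUB_TOKEN environment variable raises the API rate limit.

import asyncio

from svc_infra.loaders.github import GitHubLoader


async def main() -> None:
    loader = GitHubLoader(
        "nfraxlab/svc-infra",   # repository from the docstring example
        path="docs",            # only files under docs/
        pattern="*.md",         # glob pattern; "*.md|*.rst" would match either
        on_error="skip",        # log and skip files that fail to download
    )
    for item in await loader.load():
        # Each item is a LoadedContent; __len__ returns the content length.
        print(item.source, item.content_type, len(item))


asyncio.run(main())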
svc_infra/loaders/models.py
@@ -0,0 +1,147 @@
+"""Data models for content loaders.
+
+This module defines the core data structures used by all loaders.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+
+@dataclass
+class LoadedContent:
+    """Content loaded from a remote or local source.
+
+    This is the standard output format for all loaders. It's designed to be
+    compatible with ai-infra's Retriever.add_text() method.
+
+    Attributes:
+        content: The text content that was loaded.
+        metadata: Flexible metadata dict. Loaders add source-specific metadata
+            (e.g., repo, path, branch for GitHub). Users can add custom metadata
+            via the loader's `extra_metadata` parameter.
+        source: Source identifier (URL, file path, GitHub URI, etc.).
+            Format varies by loader:
+            - GitHubLoader: "github://owner/repo/path"
+            - URLLoader: "https://example.com/page"
+            - S3Loader: "s3://bucket/key"
+        content_type: MIME type or content category (e.g., "text/markdown",
+            "text/x-python", "text/html"). None if unknown.
+        encoding: Character encoding (default: utf-8).
+
+    Example:
+        >>> content = LoadedContent(
+        ...     content="# Authentication\\n\\nThis guide covers...",
+        ...     source="github://nfraxlab/svc-infra/docs/auth.md",
+        ...     content_type="text/markdown",
+        ...     metadata={"repo": "nfraxlab/svc-infra", "path": "docs/auth.md"},
+        ... )
+        >>>
+        >>> # Use with ai-infra Retriever
+        >>> retriever.add_text(content.content, metadata=content.metadata)
+    """
+
+    content: str
+    metadata: dict[str, Any] = field(default_factory=dict)
+    source: str = ""
+    content_type: str | None = None
+    encoding: str = "utf-8"
+
+    def __post_init__(self) -> None:
+        """Validate and normalize fields after initialization."""
+        # Ensure metadata is a dict
+        if self.metadata is None:
+            self.metadata = {}
+
+        # Add source to metadata if not already present
+        if self.source and "source" not in self.metadata:
+            self.metadata["source"] = self.source
+
+    def to_tuple(self) -> tuple[str, dict[str, Any]]:
+        """Convert to (content, metadata) tuple.
+
+        This format is compatible with ai-infra's Retriever.add_text() and
+        the legacy LoadedDocument type from ai-infra/retriever/loaders.py.
+
+        Returns:
+            Tuple of (content, metadata).
+
+        Example:
+            >>> content, metadata = loaded_content.to_tuple()
+            >>> retriever.add_text(content, metadata=metadata)
+        """
+        return (self.content, self.metadata)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for JSON serialization.
+
+        Returns:
+            Dictionary with all fields, suitable for JSON serialization.
+
+        Example:
+            >>> data = loaded_content.to_dict()
+            >>> json.dumps(data)
+        """
+        return {
+            "content": self.content,
+            "metadata": self.metadata,
+            "source": self.source,
+            "content_type": self.content_type,
+            "encoding": self.encoding,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> LoadedContent:
+        """Create LoadedContent from a dictionary.
+
+        Args:
+            data: Dictionary with content, metadata, source, etc.
+
+        Returns:
+            New LoadedContent instance.
+
+        Example:
+            >>> data = {"content": "Hello", "source": "test.txt"}
+            >>> content = LoadedContent.from_dict(data)
+        """
+        return cls(
+            content=data.get("content", ""),
+            metadata=data.get("metadata", {}),
+            source=data.get("source", ""),
+            content_type=data.get("content_type"),
+            encoding=data.get("encoding", "utf-8"),
+        )
+
+    def __len__(self) -> int:
+        """Return the length of the content."""
+        return len(self.content)
+
+    def __bool__(self) -> bool:
+        """Return True if content is non-empty."""
+        return bool(self.content.strip())
+
+
+# Type alias for backward compatibility with ai-infra loaders
+LoadedDocument = tuple[str, dict[str, Any]]
+
+
+def to_loaded_documents(contents: list[LoadedContent]) -> list[LoadedDocument]:
+    """Convert LoadedContent list to LoadedDocument list.
+
+    This is a compatibility helper for code that expects the legacy
+    (content, metadata) tuple format from ai-infra/retriever/loaders.py.
+
+    Args:
+        contents: List of LoadedContent objects.
+
+    Returns:
+        List of (content, metadata) tuples.
+
+    Example:
+        >>> contents = await loader.load()
+        >>> documents = to_loaded_documents(contents)
+        >>> for content, metadata in documents:
+        ...     retriever.add_text(content, metadata=metadata)
+    """
+    return [c.to_tuple() for c in contents]
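To round out the picture, a small sketch exercising LoadedContent and the to_loaded_documents() compatibility helper from the new models module. The retriever.add_text() call mentioned in the docstrings belongs to ai-infra and is not shown here; the values below are illustrative only.

from svc_infra.loaders.models import LoadedContent, to_loaded_documents

doc = LoadedContent(
    content="# Authentication\n\nThis guide covers...",
    source="github://nfraxlab/svc-infra/docs/auth.md",
    content_type="text/markdown",
)

# __post_init__ copies the source into metadata, so downstream consumers keep provenance.
assert doc.metadata["source"] == doc.source

# Serialize and restore, e.g. for caching fetched content between runs.
restored = LoadedContent.from_dict(doc.to_dict())
assert restored.to_tuple() == doc.to_tuple()

# Legacy (content, metadata) tuples for ai-infra style retrievers.
for text, metadata in to_loaded_documents([restored]):
    print(len(text), metadata["source"])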