remdb 0.3.242__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of remdb might be problematic. Click here for more details.
- rem/__init__.py +129 -0
- rem/agentic/README.md +760 -0
- rem/agentic/__init__.py +54 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +38 -0
- rem/agentic/agents/agent_manager.py +311 -0
- rem/agentic/agents/sse_simulator.py +502 -0
- rem/agentic/context.py +425 -0
- rem/agentic/context_builder.py +360 -0
- rem/agentic/llm_provider_models.py +301 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +273 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +240 -0
- rem/agentic/providers/phoenix.py +926 -0
- rem/agentic/providers/pydantic_ai.py +854 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +737 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +242 -0
- rem/api/README.md +657 -0
- rem/api/deps.py +253 -0
- rem/api/main.py +460 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +820 -0
- rem/api/mcp_router/server.py +243 -0
- rem/api/mcp_router/tools.py +1605 -0
- rem/api/middleware/tracking.py +172 -0
- rem/api/routers/admin.py +520 -0
- rem/api/routers/auth.py +898 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/child_streaming.py +394 -0
- rem/api/routers/chat/completions.py +702 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +202 -0
- rem/api/routers/chat/otel_utils.py +33 -0
- rem/api/routers/chat/sse_events.py +546 -0
- rem/api/routers/chat/streaming.py +950 -0
- rem/api/routers/chat/streaming_utils.py +327 -0
- rem/api/routers/common.py +18 -0
- rem/api/routers/dev.py +87 -0
- rem/api/routers/feedback.py +276 -0
- rem/api/routers/messages.py +620 -0
- rem/api/routers/models.py +86 -0
- rem/api/routers/query.py +362 -0
- rem/api/routers/shared_sessions.py +422 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +36 -0
- rem/auth/jwt.py +367 -0
- rem/auth/middleware.py +318 -0
- rem/auth/providers/__init__.py +16 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/email.py +215 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +517 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +299 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +549 -0
- rem/cli/commands/cluster.py +1808 -0
- rem/cli/commands/configure.py +495 -0
- rem/cli/commands/db.py +828 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1698 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +388 -0
- rem/cli/commands/query.py +109 -0
- rem/cli/commands/scaffold.py +47 -0
- rem/cli/commands/schema.py +230 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/commands/session.py +453 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +123 -0
- rem/config.py +244 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +70 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +672 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +246 -0
- rem/models/entities/__init__.py +68 -0
- rem/models/entities/domain_resource.py +38 -0
- rem/models/entities/feedback.py +123 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +64 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +181 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/session.py +84 -0
- rem/models/entities/shared_session.py +180 -0
- rem/models/entities/subscriber.py +175 -0
- rem/models/entities/user.py +93 -0
- rem/py.typed +0 -0
- rem/registry.py +373 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/agent-builder.yaml +235 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +132 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +18 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +760 -0
- rem/services/content/service.py +762 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +322 -0
- rem/services/dreaming/moment_service.py +251 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/email/__init__.py +10 -0
- rem/services/email/service.py +522 -0
- rem/services/email/templates.py +360 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +127 -0
- rem/services/embeddings/worker.py +435 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +960 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +757 -0
- rem/services/postgres/__init__.py +49 -0
- rem/services/postgres/diff_service.py +599 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/programmable_diff_service.py +635 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
- rem/services/postgres/register_type.py +353 -0
- rem/services/postgres/repository.py +481 -0
- rem/services/postgres/schema_generator.py +661 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +355 -0
- rem/services/rate_limit.py +113 -0
- rem/services/rem/README.md +318 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +180 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +608 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +13 -0
- rem/services/session/compression.py +488 -0
- rem/services/session/pydantic_messages.py +310 -0
- rem/services/session/reload.py +85 -0
- rem/services/user_service.py +130 -0
- rem/settings.py +1877 -0
- rem/sql/background_indexes.sql +52 -0
- rem/sql/migrations/001_install.sql +983 -0
- rem/sql/migrations/002_install_models.sql +3157 -0
- rem/sql/migrations/003_optional_extensions.sql +326 -0
- rem/sql/migrations/004_cache_system.sql +282 -0
- rem/sql/migrations/005_schema_update.sql +145 -0
- rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +628 -0
- rem/utils/__init__.py +61 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/constants.py +97 -0
- rem/utils/date_utils.py +228 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +436 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/files.py +323 -0
- rem/utils/markdown.py +16 -0
- rem/utils/mime_types.py +158 -0
- rem/utils/model_helpers.py +492 -0
- rem/utils/schema_loader.py +649 -0
- rem/utils/sql_paths.py +146 -0
- rem/utils/sql_types.py +350 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +325 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +7 -0
- rem/workers/db_listener.py +579 -0
- rem/workers/db_maintainer.py +74 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- rem/workers/unlogged_maintainer.py +463 -0
- remdb-0.3.242.dist-info/METADATA +1632 -0
- remdb-0.3.242.dist-info/RECORD +235 -0
- remdb-0.3.242.dist-info/WHEEL +4 -0
- remdb-0.3.242.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,935 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Git repository provider for versioned schema and experiment syncing.
|
|
3
|
+
|
|
4
|
+
Enables REM to sync agent schemas, evaluators, and experiments from Git repositories
|
|
5
|
+
using SSH or HTTPS authentication. Designed for Kubernetes cluster environments with
|
|
6
|
+
proper secret management via Kubernetes Secrets or IRSA/Workload Identity.
|
|
7
|
+
|
|
8
|
+
**Architecture Pattern**: git-sync sidecar
|
|
9
|
+
- Primary use case: Kubernetes pods with git-sync sidecar container
|
|
10
|
+
- Alternative: Direct cloning from application code (this implementation)
|
|
11
|
+
- Caching: Local filesystem cache to minimize network traffic
|
|
12
|
+
|
|
13
|
+
**Use Cases**:
|
|
14
|
+
1. **Agent Schema Versioning**:
|
|
15
|
+
- Sync agent schemas from git://repo/schemas/
|
|
16
|
+
- Checkout specific tags/releases for reproducible builds
|
|
17
|
+
- Multi-environment: dev uses main branch, prod uses release tags
|
|
18
|
+
|
|
19
|
+
2. **Experiment Tracking**:
|
|
20
|
+
- Store evaluation datasets in git://repo/experiments/
|
|
21
|
+
- Version control for ground truth data
|
|
22
|
+
- CI/CD integration: commit → test → deploy
|
|
23
|
+
|
|
24
|
+
3. **Multi-Tenancy**:
|
|
25
|
+
- Different tenants use different repos/branches
|
|
26
|
+
- Tenant-specific schema overrides
|
|
27
|
+
- Centralized schema library with tenant customization
|
|
28
|
+
|
|
29
|
+
**Authentication Methods**:
|
|
30
|
+
|
|
31
|
+
1. **SSH (Production Recommended)**:
|
|
32
|
+
- Uses SSH keys from Kubernetes Secrets
|
|
33
|
+
- Key stored at /etc/git-secret/ssh (0400 permissions)
|
|
34
|
+
- Known hosts at /etc/git-secret/known_hosts
|
|
35
|
+
- No rate limits, full Git protocol support
|
|
36
|
+
- Example URL: ssh://git@github.com/org/repo.git
|
|
37
|
+
|
|
38
|
+
2. **HTTPS with Personal Access Token**:
|
|
39
|
+
- GitHub PAT: 5,000 API requests/hour per authenticated user
|
|
40
|
+
- GitLab PAT: Similar rate limits
|
|
41
|
+
- Easier local development setup
|
|
42
|
+
- Example URL: https://github.com/org/repo.git
|
|
43
|
+
|
|
44
|
+
**Kubernetes Secret Management**:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
# Create secret with SSH key and known_hosts
|
|
48
|
+
kubectl create secret generic git-creds \\
|
|
49
|
+
--from-file=ssh=$HOME/.ssh/id_rsa \\
|
|
50
|
+
--from-file=known_hosts=$HOME/.ssh/known_hosts
|
|
51
|
+
|
|
52
|
+
# Pod spec
|
|
53
|
+
apiVersion: v1
|
|
54
|
+
kind: Pod
|
|
55
|
+
metadata:
|
|
56
|
+
name: rem-api
|
|
57
|
+
spec:
|
|
58
|
+
volumes:
|
|
59
|
+
- name: git-secret
|
|
60
|
+
secret:
|
|
61
|
+
secretName: git-creds
|
|
62
|
+
defaultMode: 0400 # Read-only for owner
|
|
63
|
+
containers:
|
|
64
|
+
- name: rem-api
|
|
65
|
+
image: rem-api:latest
|
|
66
|
+
volumeMounts:
|
|
67
|
+
- name: git-secret
|
|
68
|
+
mountPath: /etc/git-secret
|
|
69
|
+
readOnly: true
|
|
70
|
+
securityContext:
|
|
71
|
+
fsGroup: 65533 # git user group
|
|
72
|
+
env:
|
|
73
|
+
- name: GIT__ENABLED
|
|
74
|
+
value: "true"
|
|
75
|
+
- name: GIT__DEFAULT_REPO_URL
|
|
76
|
+
value: "ssh://git@github.com/my-org/my-repo.git"
|
|
77
|
+
- name: GIT__SSH_KEY_PATH
|
|
78
|
+
value: "/etc/git-secret/ssh"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
**Path Conventions**:
|
|
82
|
+
- URI format: git://repo_url/path/to/file.yaml
|
|
83
|
+
- Local cache: {cache_dir}/{repo_hash}/{path/to/file.yaml}
|
|
84
|
+
- Agent schemas: git://repo/schemas/agent-name.yaml
|
|
85
|
+
- Experiments: git://repo/experiments/experiment-name/
|
|
86
|
+
- Evaluators: git://repo/schemas/evaluators/evaluator-name.yaml
|
|
87
|
+
|
|
88
|
+
**Sparse Checkout** (Future Enhancement):
|
|
89
|
+
- Only checkout specific directories (schemas/, experiments/)
|
|
90
|
+
- Reduces clone size for large mono-repos
|
|
91
|
+
- Faster sync times
|
|
92
|
+
|
|
93
|
+
**Examples**:
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from rem.services.fs import FS
|
|
97
|
+
from rem.settings import settings
|
|
98
|
+
|
|
99
|
+
# Enable Git provider
|
|
100
|
+
settings.git.enabled = True
|
|
101
|
+
settings.git.default_repo_url = "ssh://git@github.com/org/repo.git"
|
|
102
|
+
|
|
103
|
+
fs = FS()
|
|
104
|
+
|
|
105
|
+
# Read agent schema from git repo at specific tag
|
|
106
|
+
schema = fs.read("git://schemas/cv-parser-v1.yaml?ref=v1.0.0")
|
|
107
|
+
|
|
108
|
+
# Read from main branch (default)
|
|
109
|
+
schema = fs.read("git://schemas/cv-parser-v1.yaml")
|
|
110
|
+
|
|
111
|
+
# List all schemas in repo
|
|
112
|
+
schemas = fs.ls("git://schemas/")
|
|
113
|
+
|
|
114
|
+
# Check if file exists
|
|
115
|
+
if fs.exists("git://experiments/hello-world/ground_truth.csv"):
|
|
116
|
+
data = fs.read("git://experiments/hello-world/ground_truth.csv")
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
**Integration with Agent Factory**:
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
from rem.agentic.factory import create_agent
|
|
123
|
+
from rem.services.fs import FS
|
|
124
|
+
|
|
125
|
+
fs = FS()
|
|
126
|
+
|
|
127
|
+
# Load schema from git repo
|
|
128
|
+
schema_content = fs.read("git://schemas/cv-parser-v1.yaml?ref=v1.2.0")
|
|
129
|
+
|
|
130
|
+
# Create agent from versioned schema
|
|
131
|
+
agent = create_agent(schema_content)
|
|
132
|
+
|
|
133
|
+
# Run agent
|
|
134
|
+
result = await agent.run("Extract candidate from resume...")
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
**Performance Characteristics**:
|
|
138
|
+
- First clone: O(repo_size), typically 1-10 seconds for small repos
|
|
139
|
+
- Cached reads: O(1), local filesystem read
|
|
140
|
+
- Periodic sync: Configurable via GIT__SYNC_INTERVAL (default: 5 minutes)
|
|
141
|
+
- Shallow clones: --depth=1 reduces clone size by ~90% for large repos
|
|
142
|
+
|
|
143
|
+
**Security Considerations**:
|
|
144
|
+
- SSH keys stored in Kubernetes Secrets, not environment variables
|
|
145
|
+
- Use read-only deploy keys (GitHub: Settings → Deploy keys)
|
|
146
|
+
- Enable known_hosts verification to prevent MITM attacks
|
|
147
|
+
- Rotate PATs every 90 days (GitHub best practice)
|
|
148
|
+
- Use least-privilege principle: read-only access only
|
|
149
|
+
|
|
150
|
+
**Error Handling**:
|
|
151
|
+
- Authentication failures: Clear error messages with troubleshooting steps
|
|
152
|
+
- Network timeouts: Configurable timeout + exponential backoff
|
|
153
|
+
- Invalid refs: Fallback to default branch with warning
|
|
154
|
+
- Disk full: Clear old cached repos before cloning
|
|
155
|
+
|
|
156
|
+
**Future Enhancements**:
|
|
157
|
+
1. Git LFS support for large binary files (datasets, models)
|
|
158
|
+
2. Submodule support for shared schema libraries
|
|
159
|
+
3. Webhook-triggered sync (GitHub Actions → API call → immediate sync)
|
|
160
|
+
4. Metrics: clone time, cache hit rate, sync frequency
|
|
161
|
+
5. Multi-repo support: Multiple repos in single FS instance
|
|
162
|
+
"""
|
|
163
|
+
|
|
164
|
+
from pathlib import Path
|
|
165
|
+
from typing import Any, BinaryIO, Iterator
|
|
166
|
+
import hashlib
|
|
167
|
+
import os
|
|
168
|
+
import shutil
|
|
169
|
+
from urllib.parse import urlparse, parse_qs
|
|
170
|
+
|
|
171
|
+
from loguru import logger
|
|
172
|
+
|
|
173
|
+
# Optional GitPython dependency
|
|
174
|
+
try:
|
|
175
|
+
from git import Repo, GitCommandError
|
|
176
|
+
from git.exc import InvalidGitRepositoryError, NoSuchPathError
|
|
177
|
+
GitPython_available = True
|
|
178
|
+
except ImportError:
|
|
179
|
+
GitPython_available = False
|
|
180
|
+
Repo = None # type: ignore[assignment,misc]
|
|
181
|
+
GitCommandError = Exception # type: ignore[assignment,misc]
|
|
182
|
+
InvalidGitRepositoryError = Exception # type: ignore[assignment,misc]
|
|
183
|
+
NoSuchPathError = Exception # type: ignore[assignment,misc]
|
|
184
|
+
|
|
185
|
+
from rem.settings import settings
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def is_git(uri: str) -> bool:
    """
    Report whether a URI is addressed to the Git provider.

    The check is a plain, case-sensitive prefix test on the ``git://`` scheme;
    the rest of the URI is not inspected or validated.

    Args:
        uri: URI to classify

    Returns:
        True when the URI begins with ``git://``, False for anything else

    Examples:
        >>> is_git("git://schemas/agent.yaml")
        True
        >>> is_git("s3://bucket/file.txt")
        False
        >>> is_git("/local/path/file.txt")
        False
    """
    git_scheme_prefix = "git://"
    return uri.startswith(git_scheme_prefix)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def parse_git_uri(uri: str) -> tuple[str, str | None]:
|
|
210
|
+
"""
|
|
211
|
+
Parse Git URI into path and optional ref.
|
|
212
|
+
|
|
213
|
+
Git URIs support query parameters for specifying refs (branches, tags, commits):
|
|
214
|
+
- git://path/to/file.yaml - Uses default branch
|
|
215
|
+
- git://path/to/file.yaml?ref=v1.0.0 - Uses tag v1.0.0
|
|
216
|
+
- git://path/to/file.yaml?ref=feature-branch - Uses branch
|
|
217
|
+
- git://path/to/file.yaml?ref=abc123 - Uses commit hash
|
|
218
|
+
|
|
219
|
+
Args:
|
|
220
|
+
uri: Git URI (git://path/to/file.yaml?ref=tag)
|
|
221
|
+
|
|
222
|
+
Returns:
|
|
223
|
+
Tuple of (path, ref) where ref is None if not specified
|
|
224
|
+
|
|
225
|
+
Examples:
|
|
226
|
+
>>> parse_git_uri("git://schemas/agent.yaml")
|
|
227
|
+
('schemas/agent.yaml', None)
|
|
228
|
+
>>> parse_git_uri("git://schemas/agent.yaml?ref=v1.0.0")
|
|
229
|
+
('schemas/agent.yaml', 'v1.0.0')
|
|
230
|
+
>>> parse_git_uri("git://experiments/hello-world/?ref=main")
|
|
231
|
+
('experiments/hello-world/', 'main')
|
|
232
|
+
"""
|
|
233
|
+
# Remove git:// prefix
|
|
234
|
+
uri_without_scheme = uri[6:] # len("git://") = 6
|
|
235
|
+
|
|
236
|
+
# Split path and query string
|
|
237
|
+
if "?" in uri_without_scheme:
|
|
238
|
+
path, query = uri_without_scheme.split("?", 1)
|
|
239
|
+
# Parse query parameters
|
|
240
|
+
params = parse_qs(query)
|
|
241
|
+
ref = params.get("ref", [None])[0]
|
|
242
|
+
else:
|
|
243
|
+
path = uri_without_scheme
|
|
244
|
+
ref = None
|
|
245
|
+
|
|
246
|
+
return path, ref
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
class GitProvider:
|
|
250
|
+
"""
|
|
251
|
+
Git repository provider for versioned schema and experiment syncing.
|
|
252
|
+
|
|
253
|
+
Provides filesystem-like interface to Git repositories with authentication,
|
|
254
|
+
caching, and sparse checkout support. Designed for Kubernetes environments
|
|
255
|
+
with proper secret management.
|
|
256
|
+
|
|
257
|
+
**Authentication Priority**:
|
|
258
|
+
1. SSH key (if GIT__SSH_KEY_PATH points to valid key)
|
|
259
|
+
2. Personal Access Token (if GIT__PERSONAL_ACCESS_TOKEN is set)
|
|
260
|
+
3. Unauthenticated (public repos only)
|
|
261
|
+
|
|
262
|
+
**Caching Strategy**:
|
|
263
|
+
- Clones cached in {cache_dir}/{repo_hash}/{ref}/
|
|
264
|
+
- Repo hash: SHA256 of repo URL (prevents collisions)
|
|
265
|
+
- Ref: branch, tag, or commit hash
|
|
266
|
+
- Cache invalidation: Manual via clear_cache() or periodic sync
|
|
267
|
+
|
|
268
|
+
**Thread Safety**:
|
|
269
|
+
- Local cache is thread-safe (atomic git operations)
|
|
270
|
+
- Concurrent reads: Safe
|
|
271
|
+
- Concurrent clones of same repo: Safe (GitPython handles locking)
|
|
272
|
+
|
|
273
|
+
**Resource Management**:
|
|
274
|
+
- Disk usage: ~100MB per repo (shallow clone)
|
|
275
|
+
- Memory: Minimal (lazy loading)
|
|
276
|
+
- Network: Only on first clone or refresh
|
|
277
|
+
|
|
278
|
+
Attributes:
|
|
279
|
+
repo_url: Git repository URL (SSH or HTTPS)
|
|
280
|
+
branch: Default branch to clone
|
|
281
|
+
cache_dir: Local cache directory for cloned repos
|
|
282
|
+
ssh_key_path: Path to SSH private key
|
|
283
|
+
known_hosts_path: Path to SSH known_hosts file
|
|
284
|
+
shallow: Use shallow clone (--depth=1)
|
|
285
|
+
|
|
286
|
+
Examples:
|
|
287
|
+
>>> provider = GitProvider()
|
|
288
|
+
>>> provider.exists("schemas/cv-parser-v1.yaml")
|
|
289
|
+
True
|
|
290
|
+
>>> schema = provider.read("schemas/cv-parser-v1.yaml")
|
|
291
|
+
>>> schemas = provider.ls("schemas/")
|
|
292
|
+
['schemas/agent-1.yaml', 'schemas/agent-2.yaml']
|
|
293
|
+
"""
|
|
294
|
+
|
|
295
|
+
def __init__(
    self,
    repo_url: str | None = None,
    branch: str | None = None,
    cache_dir: str | None = None,
):
    """
    Set up the provider from explicit arguments, falling back to settings.

    Each argument that is None (or otherwise falsy) is replaced by the
    matching value under ``settings.git``. SSH key path, known_hosts path and
    the shallow-clone flag are always read from settings. The cache directory
    is created on disk as a side effect.

    Args:
        repo_url: Git repository URL (uses settings.git.default_repo_url if None)
        branch: Default branch to clone (uses settings.git.default_branch if None)
        cache_dir: Cache directory (uses settings.git.cache_dir if None)

    Raises:
        ImportError: If GitPython is not installed
        ValueError: If repo_url is not provided and settings.git.default_repo_url is None
    """
    if not GitPython_available:
        raise ImportError(
            "GitPython is required for Git provider. Install with: pip install GitPython"
        )

    git_settings = settings.git

    self.repo_url = repo_url or git_settings.default_repo_url
    if not self.repo_url:
        raise ValueError(
            "Git repository URL not provided. Set GIT__DEFAULT_REPO_URL or pass repo_url argument."
        )

    # Narrowing aid for type checkers; the guard above already rejected None.
    assert self.repo_url is not None

    self.branch = branch or git_settings.default_branch
    self.cache_dir = Path(cache_dir or git_settings.cache_dir)
    self.ssh_key_path = git_settings.ssh_key_path
    self.known_hosts_path = git_settings.known_hosts_path
    self.shallow = git_settings.shallow_clone

    # Make sure the cache root exists before any clone is attempted.
    self.cache_dir.mkdir(parents=True, exist_ok=True)

    # Short, stable cache key: first 16 hex digits of SHA-256 of the repo URL.
    url_digest = hashlib.sha256(self.repo_url.encode()).hexdigest()
    self.repo_hash = url_digest[:16]

    logger.debug(
        f"Initialized GitProvider: repo={self.repo_url}, "
        f"branch={self.branch}, cache={self.cache_dir}"
    )
|
|
343
|
+
|
|
344
|
+
def _get_cached_repo_path(self, ref: str | None = None) -> Path:
|
|
345
|
+
"""
|
|
346
|
+
Get local path for cached repository.
|
|
347
|
+
|
|
348
|
+
Args:
|
|
349
|
+
ref: Git ref (branch, tag, or commit). Uses default branch if None.
|
|
350
|
+
|
|
351
|
+
Returns:
|
|
352
|
+
Path to local cached repository
|
|
353
|
+
|
|
354
|
+
Examples:
|
|
355
|
+
>>> provider._get_cached_repo_path()
|
|
356
|
+
Path('/tmp/rem-git-cache/a1b2c3d4e5f6/main')
|
|
357
|
+
>>> provider._get_cached_repo_path('v1.0.0')
|
|
358
|
+
Path('/tmp/rem-git-cache/a1b2c3d4e5f6/v1.0.0')
|
|
359
|
+
"""
|
|
360
|
+
ref = ref or self.branch
|
|
361
|
+
return self.cache_dir / self.repo_hash / ref
|
|
362
|
+
|
|
363
|
+
def _setup_git_ssh(self) -> dict[str, str]:
    """
    Configure Git SSH authentication via environment variables.

    Returns a copy of the current process environment. When the configured
    SSH key file exists, GIT_SSH_COMMAND is added so that Git uses that key
    and the configured known_hosts file with strict host key checking.
    When the key path is unset or the file is missing, a warning is logged
    and the environment is returned without GIT_SSH_COMMAND; no exception
    is raised.

    Returns:
        Environment variables dict for Git commands

    Examples:
        >>> env = provider._setup_git_ssh()
        >>> env['GIT_SSH_COMMAND']
        'ssh -i /etc/git-secret/ssh -o UserKnownHostsFile=/etc/git-secret/known_hosts -o StrictHostKeyChecking=yes'
    """
    # Local import: shlex is not imported at file level.
    import shlex

    env = os.environ.copy()
    key_path = self.ssh_key_path

    # Guard against an unset key path; Path(None) would raise TypeError.
    if key_path and Path(key_path).exists():
        # GIT_SSH_COMMAND is interpreted by the shell, so quote both paths.
        # shlex.quote leaves ordinary paths unchanged.
        ssh_command = (
            f"ssh -i {shlex.quote(str(key_path))} "
            f"-o UserKnownHostsFile={shlex.quote(str(self.known_hosts_path))} "
            "-o StrictHostKeyChecking=yes"
        )
        env["GIT_SSH_COMMAND"] = ssh_command
        logger.debug(f"Configured Git SSH: key={self.ssh_key_path}")
    else:
        logger.warning(
            f"SSH key not found at {self.ssh_key_path}. "
            "Falling back to default Git authentication."
        )

    return env
|
|
399
|
+
|
|
400
|
+
def _setup_git_https(self, repo_url: str) -> str:
    """
    Configure HTTPS authentication with Personal Access Token.

    Injects PAT into HTTPS URL for authentication:
    https://github.com/org/repo.git → https://{token}@github.com/org/repo.git

    The token is percent-encoded (``:`` is kept so a ``user:token`` value
    still splits into username and password), any userinfo already present in
    the URL is replaced, and the rest of the URL is preserved.

    Args:
        repo_url: Original HTTPS repository URL

    Returns:
        Modified URL with embedded PAT, or ``repo_url`` unchanged when no
        token is configured or the scheme is not http/https

    Security Note:
        This method does not log the token. The returned URL contains it, so
        callers must not log the return value.

    Examples:
        >>> provider._setup_git_https("https://github.com/org/repo.git")
        'https://ghp_token123@github.com/org/repo.git'
    """
    # Local import: the module only imports urlparse/parse_qs at file level.
    from urllib.parse import quote

    token = settings.git.personal_access_token
    if not token:
        logger.warning("No Personal Access Token configured for HTTPS authentication.")
        return repo_url

    parsed = urlparse(repo_url)
    if parsed.scheme not in ("https", "http"):
        return repo_url

    # Drop existing "user[:password]@" so the result is never "token@user@host".
    host = parsed.netloc.rpartition("@")[2]

    # Encode characters such as "@" or "/" that would corrupt the authority part.
    userinfo = quote(str(token), safe=":")

    authed_url = parsed._replace(netloc=f"{userinfo}@{host}").geturl()
    logger.debug("Configured Git HTTPS authentication with PAT")
    return authed_url
|
|
434
|
+
|
|
435
|
+
def _clone_or_update(self, ref: str | None = None) -> Repo:
    """
    Clone repository or update existing clone.

    Handles both initial cloning and updating existing repositories.
    Uses shallow clones (--depth=1) for performance when enabled.

    Args:
        ref: Git ref to checkout (branch, tag, or commit)

    Returns:
        GitPython Repo object

    Raises:
        GitCommandError: If clone/fetch fails, or if checkout of both the
            requested ref and the default branch fails
        ValueError: If no repository URL is configured

    Workflow:
        1. Check if the cache directory already holds a clone (has ``.git``)
        2. If so: fetch + checkout ref (falling back to the default branch)
        3. If not: clone with ref
        4. Return Repo object

    Examples:
        >>> repo = provider._clone_or_update("v1.0.0")
        >>> repo.head.commit
        <git.Commit "abc123...">
    """
    repo_path = self._get_cached_repo_path(ref)
    ref = ref or self.branch

    # Setup authentication
    env = self._setup_git_ssh()
    repo_url = self.repo_url

    if not repo_url:
        raise ValueError("repo_url is required for cloning")

    # If HTTPS, inject PAT
    if repo_url.startswith(("https://", "http://")):
        repo_url = self._setup_git_https(repo_url)

    try:
        # Test for ".git" rather than the bare directory: an empty directory
        # left by an earlier failed clone must trigger a fresh clone, not be
        # opened as a repository.
        if (repo_path / ".git").exists():
            logger.debug(f"Updating existing repo at {repo_path}")
            repo = Repo(repo_path)

            # Fetch latest changes
            repo.remotes.origin.fetch(env=env)

            # Checkout requested ref
            try:
                repo.git.checkout(ref, env=env)
                logger.info(f"Checked out ref: {ref}")
            except GitCommandError as e:
                logger.warning(
                    f"Failed to checkout ref '{ref}': {e}. "
                    f"Falling back to default branch '{self.branch}'"
                )
                repo.git.checkout(self.branch, env=env)

            return repo

        # Clone repository
        logger.info(f"Cloning repo {self.repo_url} to {repo_path}")

        # Create only the parent; git creates the target directory itself, so
        # this method does not leave an empty target behind if the clone fails.
        repo_path.parent.mkdir(parents=True, exist_ok=True)

        # Clone with explicit arguments (mypy-safe)
        if self.shallow:
            logger.debug("Using shallow clone (--depth=1)")
            repo = Repo.clone_from(
                repo_url,
                to_path=str(repo_path),
                branch=ref,
                env=env,
                depth=1,
            )
        else:
            repo = Repo.clone_from(
                repo_url,
                to_path=str(repo_path),
                branch=ref,
                env=env,
            )
        logger.info(f"Successfully cloned repo to {repo_path}")

        return repo

    except GitCommandError as e:
        # The error text may quote the clone URL; swap the token-bearing URL
        # for the configured one so the PAT does not reach the logs.
        message = str(e)
        if repo_url != self.repo_url:
            message = message.replace(repo_url, self.repo_url)
        logger.error(f"Git operation failed: {message}")
        raise
|
|
527
|
+
|
|
528
|
+
def _get_local_path(self, path: str, ref: str | None = None) -> Path:
    """
    Get local filesystem path for a Git repository file.

    Clones or updates the repo via ``_clone_or_update``, then returns the
    path to the file inside the cached working tree. Paths that resolve
    outside the working tree (absolute paths, ``..`` segments, symlinks
    pointing elsewhere) are rejected.

    Args:
        path: Path within repository (e.g., "schemas/agent.yaml")
        ref: Git ref to checkout

    Returns:
        Local filesystem path to file

    Raises:
        FileNotFoundError: If file doesn't exist in repo, or if the path
            resolves outside the repository working tree

    Examples:
        >>> provider._get_local_path("schemas/agent.yaml", "v1.0.0")
        Path('/tmp/rem-git-cache/a1b2c3d4/v1.0.0/schemas/agent.yaml')
    """
    repo = self._clone_or_update(ref)
    repo_root = Path(repo.working_dir)
    local_path = repo_root / path

    # Joining an absolute path discards repo_root and ".." can climb out of
    # it, so compare fully resolved paths before touching the filesystem.
    if not local_path.resolve().is_relative_to(repo_root.resolve()):
        raise FileNotFoundError(
            f"Path '{path}' resolves outside the Git repository at ref '{ref or self.branch}'"
        )

    if not local_path.exists():
        raise FileNotFoundError(
            f"Path '{path}' not found in Git repository at ref '{ref or self.branch}'"
        )

    return local_path
|
|
557
|
+
|
|
558
|
+
def exists(self, uri: str) -> bool:
    """
    Report whether a file or directory is present in the Git repository.

    Args:
        uri: Git URI (git://path/to/file.yaml?ref=tag)

    Returns:
        True if the path exists in the repo, False otherwise. A failed Git
        operation is also reported as False rather than raised.

    Examples:
        >>> provider.exists("git://schemas/agent.yaml")
        True
        >>> provider.exists("git://schemas/agent.yaml?ref=v1.0.0")
        True
        >>> provider.exists("git://nonexistent.yaml")
        False
    """
    repo_relative, git_ref = parse_git_uri(uri)

    try:
        return self._get_local_path(repo_relative, git_ref).exists()
    except (FileNotFoundError, GitCommandError):
        # Either the path is absent or the clone/checkout failed; both are
        # treated as "does not exist" for this predicate.
        return False
|
|
583
|
+
|
|
584
|
+
def read(self, uri: str, **options) -> Any:
    """
    Read a file from the Git repository.

    The file is located in the local clone and then handed to LocalProvider,
    so format detection is whatever LocalProvider implements. Per the
    original documentation that covers:
    - YAML: .yaml, .yml
    - JSON: .json
    - CSV: .csv
    - Text: .txt, .md
    - Binary: .pdf, .docx, .png, etc.

    Args:
        uri: Git URI (git://path/to/file.yaml?ref=tag)
        **options: Format-specific read options, forwarded to LocalProvider

    Returns:
        Parsed file content

    Raises:
        FileNotFoundError: If file doesn't exist in repo
        ValueError: If file format is unsupported

    Examples:
        >>> schema = provider.read("git://schemas/agent.yaml")
        >>> data = provider.read("git://experiments/data.csv")
        >>> image = provider.read("git://assets/logo.png")
    """
    repo_relative, git_ref = parse_git_uri(uri)
    resolved = self._get_local_path(repo_relative, git_ref)

    # Imported at call time, as in the original implementation; the parsing
    # itself is delegated entirely to LocalProvider.
    from rem.services.fs.local_provider import LocalProvider

    return LocalProvider().read(str(resolved), **options)
|
|
619
|
+
|
|
620
|
+
def ls(self, uri: str, **options) -> list[str]:
    """
    List files in Git repository directory.

    The listing is recursive and contains files only (no directories).
    Git's own metadata under ``.git`` is never included, even when the
    repository root itself is listed.

    Args:
        uri: Git URI (git://path/to/dir/?ref=tag)
        **options: Provider options (currently unused by this method)

    Returns:
        Sorted list of file paths, relative to the cached repository path
        returned by ``_get_cached_repo_path``

    Raises:
        FileNotFoundError: If the path doesn't exist in the repo
        ValueError: If the path exists but is not a directory

    Examples:
        >>> provider.ls("git://schemas/")
        ['schemas/agent-1.yaml', 'schemas/agent-2.yaml']
        >>> provider.ls("git://experiments/hello-world/?ref=v1.0.0")
        ['experiments/hello-world/config.yaml', 'experiments/hello-world/ground_truth.csv']
    """
    path, ref = parse_git_uri(uri)
    local_path = self._get_local_path(path, ref)

    if not local_path.is_dir():
        raise ValueError(f"Path '{path}' is not a directory")

    repo_root = self._get_cached_repo_path(ref)

    files = []
    for file_path in local_path.rglob("*"):
        if not file_path.is_file():
            continue

        # Make path relative to repo root
        relative = file_path.relative_to(repo_root)

        # Git never tracks a path component named ".git", so anything under
        # one is repository metadata (objects, refs, hooks), not content.
        if ".git" in relative.parts:
            continue

        files.append(str(relative))

    return sorted(files)
|
|
654
|
+
|
|
655
|
+
def ls_iter(self, uri: str, **options) -> Iterator[str]:
    """
    Lazily yield the files found in a Git repository directory.

    This is a generator wrapper around ``ls``; the full listing is produced
    by ``ls`` on first iteration and then yielded one entry at a time.

    Args:
        uri: Git URI (git://path/to/dir/?ref=tag)
        **options: Provider options, forwarded to ``ls``

    Yields:
        File paths (relative to repo root)

    Examples:
        >>> for file in provider.ls_iter("git://schemas/"):
        ...     print(file)
        schemas/agent-1.yaml
        schemas/agent-2.yaml
    """
    yield from self.ls(uri, **options)
|
|
674
|
+
|
|
675
|
+
def clear_cache(self, ref: str | None = None):
    """
    Remove cached repository data from disk.

    Useful for:
    - Forcing fresh clone
    - Freeing disk space
    - Testing

    Args:
        ref: Specific ref to clear, or None (or any falsy value) to clear
            every cached ref of this repository

    Examples:
        >>> provider.clear_cache("v1.0.0")  # Clear specific tag
        >>> provider.clear_cache()  # Clear all refs
    """
    if not ref:
        # No ref given: drop the whole per-repository cache directory.
        all_refs_dir = self.cache_dir / self.repo_hash
        if all_refs_dir.exists():
            shutil.rmtree(all_refs_dir)
            logger.info(f"Cleared all cached refs for repo: {self.repo_url}")
        return

    # A ref was given: drop only that ref's cached clone.
    single_ref_dir = self._get_cached_repo_path(ref)
    if single_ref_dir.exists():
        shutil.rmtree(single_ref_dir)
        logger.info(f"Cleared cache for ref: {ref}")
|
|
701
|
+
|
|
702
|
+
def get_current_commit(self, ref: str | None = None) -> str:
|
|
703
|
+
"""
|
|
704
|
+
Get current commit hash for ref.
|
|
705
|
+
|
|
706
|
+
Useful for tracking which version of schema is currently loaded.
|
|
707
|
+
|
|
708
|
+
Args:
|
|
709
|
+
ref: Git ref (branch, tag, or commit)
|
|
710
|
+
|
|
711
|
+
Returns:
|
|
712
|
+
Full commit hash (40 characters)
|
|
713
|
+
|
|
714
|
+
Examples:
|
|
715
|
+
>>> provider.get_current_commit("v1.0.0")
|
|
716
|
+
'abc123def456...'
|
|
717
|
+
>>> provider.get_current_commit() # Current branch
|
|
718
|
+
'def456abc123...'
|
|
719
|
+
"""
|
|
720
|
+
repo = self._clone_or_update(ref)
|
|
721
|
+
return repo.head.commit.hexsha
|
|
722
|
+
|
|
723
|
+
def get_semantic_versions(self, file_path: str, pattern: str | None = None) -> list[dict[str, Any]]:
    """
    Get semantic version history for a file following Git tags.

    Returns list of versions where the file exists, sorted by semantic versioning.
    Useful for tracking schema evolution, comparing agent versions, and
    understanding when changes were introduced.

    Each candidate tag is checked out in the cached clone to test whether the
    file is present; the clone is always switched back to ``self.branch``
    afterwards, even if an unexpected error interrupts the scan.

    **Semantic Versioning** (semver.org):
    - Format: MAJOR.MINOR.PATCH (e.g., 2.1.0, 2.1.1, 3.0.0)
    - MAJOR: Breaking changes
    - MINOR: New features (backwards compatible)
    - PATCH: Bug fixes (backwards compatible)

    **Use Cases**:
    1. **Schema Evolution Tracking**:
       - Compare cv-parser v2.1.0 vs v2.1.1
       - Identify breaking changes (MAJOR version bumps)
       - Review feature additions (MINOR version bumps)

    2. **Rollback/Pinning**:
       - Production uses v2.1.0 (stable)
       - Staging tests v2.1.1 (latest)
       - Can rollback to v2.0.0 if needed

    3. **Deprecation Management**:
       - Mark v1.x.x as deprecated
       - Migrate users to v2.x.x
       - Track adoption rate by version

    Args:
        file_path: Path to file in repository (e.g., "schemas/agent.yaml")
        pattern: Optional regex pattern for tag filtering (e.g., "v2\\..*" for v2.x.x).
            Matched with ``search`` semantics, i.e. anywhere in the tag name.

    Returns:
        List of version dicts sorted by semantic version (newest first).
        The list is empty when the repository has no tags or the file is not
        present at any matching semver tag. ``date`` is a naive ISO-8601
        timestamp in the local timezone of the machine running this code.
        [
            {
                "tag": "v2.1.1",
                "version": (2, 1, 1),
                "commit": "abc123...",
                "date": "2025-01-15T10:30:00",
                "message": "feat: Add confidence scoring",
                "author": "alice@example.com"
            },
            {
                "tag": "v2.1.0",
                "version": (2, 1, 0),
                "commit": "def456...",
                "date": "2025-01-10T14:20:00",
                "message": "feat: Add multi-language support",
                "author": "bob@example.com"
            }
        ]

    Raises:
        re.error: If ``pattern`` is not a valid regular expression
        GitCommandError: If switching back to ``self.branch`` fails

    Examples:
        >>> # Get all versions of a schema
        >>> versions = provider.get_semantic_versions("schemas/cv-parser.yaml")
        >>> print(f"Current: {versions[0]['tag']}, Previous: {versions[1]['tag']}")
        Current: v2.1.1, Previous: v2.1.0

        >>> # Get only v2.x.x versions
        >>> v2_versions = provider.get_semantic_versions(
        ...     "schemas/cv-parser.yaml",
        ...     pattern="v2\\..*"
        ... )

        >>> # Compare two versions
        >>> v1 = provider.read(f"git://schemas/cv-parser.yaml?ref={versions[0]['tag']}")
        >>> v2 = provider.read(f"git://schemas/cv-parser.yaml?ref={versions[1]['tag']}")
        >>> # Diff logic here...

        >>> # Find version by date
        >>> target_date = "2025-01-10"
        >>> version = next(v for v in versions if v["date"].startswith(target_date))
        >>> print(version["tag"])
        v2.1.0
    """
    import re
    from datetime import datetime

    repo = self._clone_or_update()

    # Get all tags from repository
    tags = repo.tags

    if not tags:
        logger.warning(f"No tags found in repository {self.repo_url}")
        return []

    # Compile both expressions once instead of re-parsing them per tag.
    tag_filter = re.compile(pattern) if pattern else None
    # Pattern supports both flat tags (v2.1.0) and path-based tags (schemas/test/v2.1.0)
    semver_pattern = re.compile(r"(?:^|/)v?(\d+)\.(\d+)\.(\d+)")

    versions = []

    try:
        for tag in tags:
            tag_name = tag.name

            # Apply user-provided pattern filter
            if tag_filter is not None and not tag_filter.search(tag_name):
                continue

            # Extract semantic version (MAJOR.MINOR.PATCH)
            match = semver_pattern.search(tag_name)
            if not match:
                continue  # Skip non-semver tags

            major, minor, patch = map(int, match.groups())

            # Check if file exists at this tag
            try:
                repo.git.checkout(tag_name)

                if not (Path(repo.working_dir) / file_path).exists():
                    continue  # File doesn't exist in this version

                # Get commit info for this tag
                commit = tag.commit
                commit_date = datetime.fromtimestamp(commit.committed_date)

                versions.append({
                    "tag": tag_name,
                    "version": (major, minor, patch),
                    "commit": commit.hexsha,
                    "date": commit_date.isoformat(),
                    "message": commit.message.strip(),
                    "author": commit.author.email,
                })

            except (GitCommandError, FileNotFoundError):
                # A tag that cannot be checked out is skipped, not fatal.
                continue
    finally:
        # The clone is a shared cache entry: never leave it parked on a tag,
        # even when an unexpected exception aborts the scan.
        repo.git.checkout(self.branch)

    # Sort by semantic version (newest first)
    versions.sort(key=lambda v: v["version"], reverse=True)

    logger.info(
        f"Found {len(versions)} semantic versions for {file_path} "
        f"(pattern: {pattern or 'all'})"
    )

    return versions
|
|
870
|
+
|
|
871
|
+
def diff_versions(
    self,
    file_path: str,
    version1: str,
    version2: str,
    unified: int = 3
) -> str:
    """
    Generate unified diff between two versions of a file.

    Runs ``git diff <version1> <version2> -- <file_path>`` in the cached
    clone, with ``unified`` passed as the ``--unified`` option.

    Useful for:
    - Code review: What changed between v2.1.0 and v2.1.1?
    - Migration planning: Breaking changes from v1.x.x to v2.x.x?
    - Audit trail: Who changed what and when?

    Args:
        file_path: Path to file in repository
        version1: First version tag (e.g., "v2.1.0")
        version2: Second version tag (e.g., "v2.1.1")
        unified: Number of context lines (default: 3)

    Returns:
        Unified diff string (Git format)

    Raises:
        ValueError: If Git cannot produce the diff (for example an unknown
            tag). The underlying GitCommandError is attached as
            ``__cause__``.

    Examples:
        >>> # Compare adjacent versions
        >>> diff = provider.diff_versions(
        ...     "schemas/cv-parser.yaml",
        ...     "v2.1.0",
        ...     "v2.1.1"
        ... )
        >>> print(diff)
        --- a/schemas/cv-parser.yaml
        +++ b/schemas/cv-parser.yaml
        @@ -10,6 +10,7 @@
         skills:
           type: array
        +  description: Candidate technical skills
         experience:
           type: array

        >>> # Check for breaking changes
        >>> if "required:" in diff and "-" in diff:
        ...     print("⚠️ Breaking change: Required field removed")
    """
    repo = self._clone_or_update()

    try:
        # "--" separates the revisions from the path so a file name can
        # never be mistaken for a revision.
        return repo.git.diff(
            version1,
            version2,
            "--",
            file_path,
            unified=unified
        )

    except GitCommandError as e:
        logger.error(f"Failed to generate diff: {e}")
        # Chain explicitly so the original Git failure stays visible to
        # callers inspecting the ValueError.
        raise ValueError(
            f"Could not diff {file_path} between {version1} and {version2}. "
            "Ensure both tags exist and file is present in both versions."
        ) from e
|