remdb 0.3.242__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic. Click here for more details.

Files changed (235) hide show
  1. rem/__init__.py +129 -0
  2. rem/agentic/README.md +760 -0
  3. rem/agentic/__init__.py +54 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +38 -0
  6. rem/agentic/agents/agent_manager.py +311 -0
  7. rem/agentic/agents/sse_simulator.py +502 -0
  8. rem/agentic/context.py +425 -0
  9. rem/agentic/context_builder.py +360 -0
  10. rem/agentic/llm_provider_models.py +301 -0
  11. rem/agentic/mcp/__init__.py +0 -0
  12. rem/agentic/mcp/tool_wrapper.py +273 -0
  13. rem/agentic/otel/__init__.py +5 -0
  14. rem/agentic/otel/setup.py +240 -0
  15. rem/agentic/providers/phoenix.py +926 -0
  16. rem/agentic/providers/pydantic_ai.py +854 -0
  17. rem/agentic/query.py +117 -0
  18. rem/agentic/query_helper.py +89 -0
  19. rem/agentic/schema.py +737 -0
  20. rem/agentic/serialization.py +245 -0
  21. rem/agentic/tools/__init__.py +5 -0
  22. rem/agentic/tools/rem_tools.py +242 -0
  23. rem/api/README.md +657 -0
  24. rem/api/deps.py +253 -0
  25. rem/api/main.py +460 -0
  26. rem/api/mcp_router/prompts.py +182 -0
  27. rem/api/mcp_router/resources.py +820 -0
  28. rem/api/mcp_router/server.py +243 -0
  29. rem/api/mcp_router/tools.py +1605 -0
  30. rem/api/middleware/tracking.py +172 -0
  31. rem/api/routers/admin.py +520 -0
  32. rem/api/routers/auth.py +898 -0
  33. rem/api/routers/chat/__init__.py +5 -0
  34. rem/api/routers/chat/child_streaming.py +394 -0
  35. rem/api/routers/chat/completions.py +702 -0
  36. rem/api/routers/chat/json_utils.py +76 -0
  37. rem/api/routers/chat/models.py +202 -0
  38. rem/api/routers/chat/otel_utils.py +33 -0
  39. rem/api/routers/chat/sse_events.py +546 -0
  40. rem/api/routers/chat/streaming.py +950 -0
  41. rem/api/routers/chat/streaming_utils.py +327 -0
  42. rem/api/routers/common.py +18 -0
  43. rem/api/routers/dev.py +87 -0
  44. rem/api/routers/feedback.py +276 -0
  45. rem/api/routers/messages.py +620 -0
  46. rem/api/routers/models.py +86 -0
  47. rem/api/routers/query.py +362 -0
  48. rem/api/routers/shared_sessions.py +422 -0
  49. rem/auth/README.md +258 -0
  50. rem/auth/__init__.py +36 -0
  51. rem/auth/jwt.py +367 -0
  52. rem/auth/middleware.py +318 -0
  53. rem/auth/providers/__init__.py +16 -0
  54. rem/auth/providers/base.py +376 -0
  55. rem/auth/providers/email.py +215 -0
  56. rem/auth/providers/google.py +163 -0
  57. rem/auth/providers/microsoft.py +237 -0
  58. rem/cli/README.md +517 -0
  59. rem/cli/__init__.py +8 -0
  60. rem/cli/commands/README.md +299 -0
  61. rem/cli/commands/__init__.py +3 -0
  62. rem/cli/commands/ask.py +549 -0
  63. rem/cli/commands/cluster.py +1808 -0
  64. rem/cli/commands/configure.py +495 -0
  65. rem/cli/commands/db.py +828 -0
  66. rem/cli/commands/dreaming.py +324 -0
  67. rem/cli/commands/experiments.py +1698 -0
  68. rem/cli/commands/mcp.py +66 -0
  69. rem/cli/commands/process.py +388 -0
  70. rem/cli/commands/query.py +109 -0
  71. rem/cli/commands/scaffold.py +47 -0
  72. rem/cli/commands/schema.py +230 -0
  73. rem/cli/commands/serve.py +106 -0
  74. rem/cli/commands/session.py +453 -0
  75. rem/cli/dreaming.py +363 -0
  76. rem/cli/main.py +123 -0
  77. rem/config.py +244 -0
  78. rem/mcp_server.py +41 -0
  79. rem/models/core/__init__.py +49 -0
  80. rem/models/core/core_model.py +70 -0
  81. rem/models/core/engram.py +333 -0
  82. rem/models/core/experiment.py +672 -0
  83. rem/models/core/inline_edge.py +132 -0
  84. rem/models/core/rem_query.py +246 -0
  85. rem/models/entities/__init__.py +68 -0
  86. rem/models/entities/domain_resource.py +38 -0
  87. rem/models/entities/feedback.py +123 -0
  88. rem/models/entities/file.py +57 -0
  89. rem/models/entities/image_resource.py +88 -0
  90. rem/models/entities/message.py +64 -0
  91. rem/models/entities/moment.py +123 -0
  92. rem/models/entities/ontology.py +181 -0
  93. rem/models/entities/ontology_config.py +131 -0
  94. rem/models/entities/resource.py +95 -0
  95. rem/models/entities/schema.py +87 -0
  96. rem/models/entities/session.py +84 -0
  97. rem/models/entities/shared_session.py +180 -0
  98. rem/models/entities/subscriber.py +175 -0
  99. rem/models/entities/user.py +93 -0
  100. rem/py.typed +0 -0
  101. rem/registry.py +373 -0
  102. rem/schemas/README.md +507 -0
  103. rem/schemas/__init__.py +6 -0
  104. rem/schemas/agents/README.md +92 -0
  105. rem/schemas/agents/core/agent-builder.yaml +235 -0
  106. rem/schemas/agents/core/moment-builder.yaml +178 -0
  107. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  108. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  109. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  110. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  111. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  112. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  113. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  114. rem/schemas/agents/examples/hello-world.yaml +37 -0
  115. rem/schemas/agents/examples/query.yaml +54 -0
  116. rem/schemas/agents/examples/simple.yaml +21 -0
  117. rem/schemas/agents/examples/test.yaml +29 -0
  118. rem/schemas/agents/rem.yaml +132 -0
  119. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  120. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  121. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  122. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  123. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  124. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  125. rem/services/__init__.py +18 -0
  126. rem/services/audio/INTEGRATION.md +308 -0
  127. rem/services/audio/README.md +376 -0
  128. rem/services/audio/__init__.py +15 -0
  129. rem/services/audio/chunker.py +354 -0
  130. rem/services/audio/transcriber.py +259 -0
  131. rem/services/content/README.md +1269 -0
  132. rem/services/content/__init__.py +5 -0
  133. rem/services/content/providers.py +760 -0
  134. rem/services/content/service.py +762 -0
  135. rem/services/dreaming/README.md +230 -0
  136. rem/services/dreaming/__init__.py +53 -0
  137. rem/services/dreaming/affinity_service.py +322 -0
  138. rem/services/dreaming/moment_service.py +251 -0
  139. rem/services/dreaming/ontology_service.py +54 -0
  140. rem/services/dreaming/user_model_service.py +297 -0
  141. rem/services/dreaming/utils.py +39 -0
  142. rem/services/email/__init__.py +10 -0
  143. rem/services/email/service.py +522 -0
  144. rem/services/email/templates.py +360 -0
  145. rem/services/embeddings/__init__.py +11 -0
  146. rem/services/embeddings/api.py +127 -0
  147. rem/services/embeddings/worker.py +435 -0
  148. rem/services/fs/README.md +662 -0
  149. rem/services/fs/__init__.py +62 -0
  150. rem/services/fs/examples.py +206 -0
  151. rem/services/fs/examples_paths.py +204 -0
  152. rem/services/fs/git_provider.py +935 -0
  153. rem/services/fs/local_provider.py +760 -0
  154. rem/services/fs/parsing-hooks-examples.md +172 -0
  155. rem/services/fs/paths.py +276 -0
  156. rem/services/fs/provider.py +460 -0
  157. rem/services/fs/s3_provider.py +1042 -0
  158. rem/services/fs/service.py +186 -0
  159. rem/services/git/README.md +1075 -0
  160. rem/services/git/__init__.py +17 -0
  161. rem/services/git/service.py +469 -0
  162. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  163. rem/services/phoenix/README.md +453 -0
  164. rem/services/phoenix/__init__.py +46 -0
  165. rem/services/phoenix/client.py +960 -0
  166. rem/services/phoenix/config.py +88 -0
  167. rem/services/phoenix/prompt_labels.py +477 -0
  168. rem/services/postgres/README.md +757 -0
  169. rem/services/postgres/__init__.py +49 -0
  170. rem/services/postgres/diff_service.py +599 -0
  171. rem/services/postgres/migration_service.py +427 -0
  172. rem/services/postgres/programmable_diff_service.py +635 -0
  173. rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
  174. rem/services/postgres/register_type.py +353 -0
  175. rem/services/postgres/repository.py +481 -0
  176. rem/services/postgres/schema_generator.py +661 -0
  177. rem/services/postgres/service.py +802 -0
  178. rem/services/postgres/sql_builder.py +355 -0
  179. rem/services/rate_limit.py +113 -0
  180. rem/services/rem/README.md +318 -0
  181. rem/services/rem/__init__.py +23 -0
  182. rem/services/rem/exceptions.py +71 -0
  183. rem/services/rem/executor.py +293 -0
  184. rem/services/rem/parser.py +180 -0
  185. rem/services/rem/queries.py +196 -0
  186. rem/services/rem/query.py +371 -0
  187. rem/services/rem/service.py +608 -0
  188. rem/services/session/README.md +374 -0
  189. rem/services/session/__init__.py +13 -0
  190. rem/services/session/compression.py +488 -0
  191. rem/services/session/pydantic_messages.py +310 -0
  192. rem/services/session/reload.py +85 -0
  193. rem/services/user_service.py +130 -0
  194. rem/settings.py +1877 -0
  195. rem/sql/background_indexes.sql +52 -0
  196. rem/sql/migrations/001_install.sql +983 -0
  197. rem/sql/migrations/002_install_models.sql +3157 -0
  198. rem/sql/migrations/003_optional_extensions.sql +326 -0
  199. rem/sql/migrations/004_cache_system.sql +282 -0
  200. rem/sql/migrations/005_schema_update.sql +145 -0
  201. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  202. rem/utils/AGENTIC_CHUNKING.md +597 -0
  203. rem/utils/README.md +628 -0
  204. rem/utils/__init__.py +61 -0
  205. rem/utils/agentic_chunking.py +622 -0
  206. rem/utils/batch_ops.py +343 -0
  207. rem/utils/chunking.py +108 -0
  208. rem/utils/clip_embeddings.py +276 -0
  209. rem/utils/constants.py +97 -0
  210. rem/utils/date_utils.py +228 -0
  211. rem/utils/dict_utils.py +98 -0
  212. rem/utils/embeddings.py +436 -0
  213. rem/utils/examples/embeddings_example.py +305 -0
  214. rem/utils/examples/sql_types_example.py +202 -0
  215. rem/utils/files.py +323 -0
  216. rem/utils/markdown.py +16 -0
  217. rem/utils/mime_types.py +158 -0
  218. rem/utils/model_helpers.py +492 -0
  219. rem/utils/schema_loader.py +649 -0
  220. rem/utils/sql_paths.py +146 -0
  221. rem/utils/sql_types.py +350 -0
  222. rem/utils/user_id.py +81 -0
  223. rem/utils/vision.py +325 -0
  224. rem/workers/README.md +506 -0
  225. rem/workers/__init__.py +7 -0
  226. rem/workers/db_listener.py +579 -0
  227. rem/workers/db_maintainer.py +74 -0
  228. rem/workers/dreaming.py +502 -0
  229. rem/workers/engram_processor.py +312 -0
  230. rem/workers/sqs_file_processor.py +193 -0
  231. rem/workers/unlogged_maintainer.py +463 -0
  232. remdb-0.3.242.dist-info/METADATA +1632 -0
  233. remdb-0.3.242.dist-info/RECORD +235 -0
  234. remdb-0.3.242.dist-info/WHEEL +4 -0
  235. remdb-0.3.242.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,935 @@
1
+ """
2
+ Git repository provider for versioned schema and experiment syncing.
3
+
4
+ Enables REM to sync agent schemas, evaluators, and experiments from Git repositories
5
+ using SSH or HTTPS authentication. Designed for Kubernetes cluster environments with
6
+ proper secret management via Kubernetes Secrets or IRSA/Workload Identity.
7
+
8
+ **Architecture Pattern**: git-sync sidecar
9
+ - Primary use case: Kubernetes pods with git-sync sidecar container
10
+ - Alternative: Direct cloning from application code (this implementation)
11
+ - Caching: Local filesystem cache to minimize network traffic
12
+
13
+ **Use Cases**:
14
+ 1. **Agent Schema Versioning**:
15
+ - Sync agent schemas from git://repo/schemas/
16
+ - Checkout specific tags/releases for reproducible builds
17
+ - Multi-environment: dev uses main branch, prod uses release tags
18
+
19
+ 2. **Experiment Tracking**:
20
+ - Store evaluation datasets in git://repo/experiments/
21
+ - Version control for ground truth data
22
+ - CI/CD integration: commit → test → deploy
23
+
24
+ 3. **Multi-Tenancy**:
25
+ - Different tenants use different repos/branches
26
+ - Tenant-specific schema overrides
27
+ - Centralized schema library with tenant customization
28
+
29
+ **Authentication Methods**:
30
+
31
+ 1. **SSH (Production Recommended)**:
32
+ - Uses SSH keys from Kubernetes Secrets
33
+ - Key stored at /etc/git-secret/ssh (0400 permissions)
34
+ - Known hosts at /etc/git-secret/known_hosts
35
+ - No rate limits, full Git protocol support
36
+ - Example URL: ssh://git@github.com/org/repo.git
37
+
38
+ 2. **HTTPS with Personal Access Token**:
39
+ - GitHub PAT: 5,000 API requests/hour per authenticated user
40
+ - GitLab PAT: Similar rate limits
41
+ - Easier local development setup
42
+ - Example URL: https://github.com/org/repo.git
43
+
44
+ **Kubernetes Secret Management**:
45
+
46
+ ```bash
47
+ # Create secret with SSH key and known_hosts
48
+ kubectl create secret generic git-creds \\
49
+ --from-file=ssh=$HOME/.ssh/id_rsa \\
50
+ --from-file=known_hosts=$HOME/.ssh/known_hosts
51
+
52
+ # Pod spec
53
+ apiVersion: v1
54
+ kind: Pod
55
+ metadata:
56
+ name: rem-api
57
+ spec:
58
+ volumes:
59
+ - name: git-secret
60
+ secret:
61
+ secretName: git-creds
62
+ defaultMode: 0400 # Read-only for owner
63
+ containers:
64
+ - name: rem-api
65
+ image: rem-api:latest
66
+ volumeMounts:
67
+ - name: git-secret
68
+ mountPath: /etc/git-secret
69
+ readOnly: true
70
+ securityContext:
71
+ fsGroup: 65533 # git user group
72
+ env:
73
+ - name: GIT__ENABLED
74
+ value: "true"
75
+ - name: GIT__DEFAULT_REPO_URL
76
+ value: "ssh://git@github.com/my-org/my-repo.git"
77
+ - name: GIT__SSH_KEY_PATH
78
+ value: "/etc/git-secret/ssh"
79
+ ```
80
+
81
+ **Path Conventions**:
82
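+ - URI format: git://path/to/file.yaml[?ref=<branch|tag|commit>] (the repository itself comes from GIT__DEFAULT_REPO_URL or the repo_url argument)
83
+ - Local cache: {cache_dir}/{repo_hash}/{ref}/{path/to/file.yaml}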
84
+ - Agent schemas: git://repo/schemas/agent-name.yaml
85
+ - Experiments: git://repo/experiments/experiment-name/
86
+ - Evaluators: git://repo/schemas/evaluators/evaluator-name.yaml
87
+
88
+ **Sparse Checkout** (Future Enhancement):
89
+ - Only checkout specific directories (schemas/, experiments/)
90
+ - Reduces clone size for large mono-repos
91
+ - Faster sync times
92
+
93
+ **Examples**:
94
+
95
+ ```python
96
+ from rem.services.fs import FS
97
+ from rem.settings import settings
98
+
99
+ # Enable Git provider
100
+ settings.git.enabled = True
101
+ settings.git.default_repo_url = "ssh://git@github.com/org/repo.git"
102
+
103
+ fs = FS()
104
+
105
+ # Read agent schema from git repo at specific tag
106
+ schema = fs.read("git://schemas/cv-parser-v1.yaml?ref=v1.0.0")
107
+
108
+ # Read from main branch (default)
109
+ schema = fs.read("git://schemas/cv-parser-v1.yaml")
110
+
111
+ # List all schemas in repo
112
+ schemas = fs.ls("git://schemas/")
113
+
114
+ # Check if file exists
115
+ if fs.exists("git://experiments/hello-world/ground_truth.csv"):
116
+ data = fs.read("git://experiments/hello-world/ground_truth.csv")
117
+ ```
118
+
119
+ **Integration with Agent Factory**:
120
+
121
+ ```python
122
+ from rem.agentic.factory import create_agent
123
+ from rem.services.fs import FS
124
+
125
+ fs = FS()
126
+
127
+ # Load schema from git repo
128
+ schema_content = fs.read("git://schemas/cv-parser-v1.yaml?ref=v1.2.0")
129
+
130
+ # Create agent from versioned schema
131
+ agent = create_agent(schema_content)
132
+
133
+ # Run agent
134
+ result = await agent.run("Extract candidate from resume...")
135
+ ```
136
+
137
+ **Performance Characteristics**:
138
+ - First clone: O(repo_size), typically 1-10 seconds for small repos
139
+ - Cached reads: O(1), local filesystem read
140
+ - Periodic sync: Configurable via GIT__SYNC_INTERVAL (default: 5 minutes)
141
+ - Shallow clones: --depth=1 reduces clone size by ~90% for large repos
142
+
143
+ **Security Considerations**:
144
+ - SSH keys stored in Kubernetes Secrets, not environment variables
145
+ - Use read-only deploy keys (GitHub: Settings → Deploy keys)
146
+ - Enable known_hosts verification to prevent MITM attacks
147
+ - Rotate PATs every 90 days (GitHub best practice)
148
+ - Use least-privilege principle: read-only access only
149
+
150
+ **Error Handling**:
151
+ - Authentication failures: Clear error messages with troubleshooting steps
152
+ - Network timeouts: Configurable timeout + exponential backoff
153
+ - Invalid refs: Fallback to default branch with warning
154
+ - Disk full: Clear old cached repos before cloning
155
+
156
+ **Future Enhancements**:
157
+ 1. Git LFS support for large binary files (datasets, models)
158
+ 2. Submodule support for shared schema libraries
159
+ 3. Webhook-triggered sync (GitHub Actions → API call → immediate sync)
160
+ 4. Metrics: clone time, cache hit rate, sync frequency
161
+ 5. Multi-repo support: Multiple repos in single FS instance
162
+ """
163
+
164
+ from pathlib import Path
165
+ from typing import Any, BinaryIO, Iterator
166
+ import hashlib
167
+ import os
168
+ import shutil
169
+ from urllib.parse import urlparse, parse_qs
170
+
171
+ from loguru import logger
172
+
173
+ # Optional GitPython dependency
174
+ try:
175
+ from git import Repo, GitCommandError
176
+ from git.exc import InvalidGitRepositoryError, NoSuchPathError
177
+ GitPython_available = True
178
+ except ImportError:
179
+ GitPython_available = False
180
+ Repo = None # type: ignore[assignment,misc]
181
+ GitCommandError = Exception # type: ignore[assignment,misc]
182
+ InvalidGitRepositoryError = Exception # type: ignore[assignment,misc]
183
+ NoSuchPathError = Exception # type: ignore[assignment,misc]
184
+
185
+ from rem.settings import settings
186
+
187
+
188
def is_git(uri: str) -> bool:
    """
    Tell whether a URI is addressed to the Git provider.

    Args:
        uri: URI to classify

    Returns:
        True when the URI begins with the git:// scheme, False otherwise

    Examples:
        >>> is_git("git://schemas/agent.yaml")
        True
        >>> is_git("s3://bucket/file.txt")
        False
        >>> is_git("/local/path/file.txt")
        False
    """
    git_scheme = "git://"
    matches_scheme = uri.startswith(git_scheme)
    return matches_scheme
207
+
208
+
209
+ def parse_git_uri(uri: str) -> tuple[str, str | None]:
210
+ """
211
+ Parse Git URI into path and optional ref.
212
+
213
+ Git URIs support query parameters for specifying refs (branches, tags, commits):
214
+ - git://path/to/file.yaml - Uses default branch
215
+ - git://path/to/file.yaml?ref=v1.0.0 - Uses tag v1.0.0
216
+ - git://path/to/file.yaml?ref=feature-branch - Uses branch
217
+ - git://path/to/file.yaml?ref=abc123 - Uses commit hash
218
+
219
+ Args:
220
+ uri: Git URI (git://path/to/file.yaml?ref=tag)
221
+
222
+ Returns:
223
+ Tuple of (path, ref) where ref is None if not specified
224
+
225
+ Examples:
226
+ >>> parse_git_uri("git://schemas/agent.yaml")
227
+ ('schemas/agent.yaml', None)
228
+ >>> parse_git_uri("git://schemas/agent.yaml?ref=v1.0.0")
229
+ ('schemas/agent.yaml', 'v1.0.0')
230
+ >>> parse_git_uri("git://experiments/hello-world/?ref=main")
231
+ ('experiments/hello-world/', 'main')
232
+ """
233
+ # Remove git:// prefix
234
+ uri_without_scheme = uri[6:] # len("git://") = 6
235
+
236
+ # Split path and query string
237
+ if "?" in uri_without_scheme:
238
+ path, query = uri_without_scheme.split("?", 1)
239
+ # Parse query parameters
240
+ params = parse_qs(query)
241
+ ref = params.get("ref", [None])[0]
242
+ else:
243
+ path = uri_without_scheme
244
+ ref = None
245
+
246
+ return path, ref
247
+
248
+
249
+ class GitProvider:
250
+ """
251
+ Git repository provider for versioned schema and experiment syncing.
252
+
253
+ Provides filesystem-like interface to Git repositories with authentication,
254
+ caching, and shallow-clone support. Designed for Kubernetes environments
255
+ with proper secret management.
256
+
257
+ **Authentication Priority**:
258
+ 1. SSH key (if GIT__SSH_KEY_PATH points to valid key)
259
+ 2. Personal Access Token (if GIT__PERSONAL_ACCESS_TOKEN is set)
260
+ 3. Unauthenticated (public repos only)
261
+
262
+ **Caching Strategy**:
263
+ - Clones cached in {cache_dir}/{repo_hash}/{ref}/
264
+ - Repo hash: SHA256 of repo URL (prevents collisions)
265
+ - Ref: branch, tag, or commit hash
266
+ - Cache invalidation: Manual via clear_cache() or periodic sync
267
+
268
+ **Thread Safety**:
269
+ - Local cache is thread-safe (atomic git operations)
270
+ - Concurrent reads: Safe
271
+ - Concurrent clones of same repo: Safe (GitPython handles locking)
272
+
273
+ **Resource Management**:
274
+ - Disk usage: ~100MB per repo (shallow clone)
275
+ - Memory: Minimal (lazy loading)
276
+ - Network: Only on first clone or refresh
277
+
278
+ Attributes:
279
+ repo_url: Git repository URL (SSH or HTTPS)
280
+ branch: Default branch to clone
281
+ cache_dir: Local cache directory for cloned repos
282
+ ssh_key_path: Path to SSH private key
283
+ known_hosts_path: Path to SSH known_hosts file
284
+ shallow: Use shallow clone (--depth=1)
285
+
286
+ Examples:
287
+ >>> provider = GitProvider()
288
+ >>> provider.exists("git://schemas/cv-parser-v1.yaml")
289
+ True
290
+ >>> schema = provider.read("git://schemas/cv-parser-v1.yaml")
291
+ >>> schemas = provider.ls("git://schemas/")
292
+ ['schemas/agent-1.yaml', 'schemas/agent-2.yaml']
293
+ """
294
+
295
def __init__(
    self,
    repo_url: str | None = None,
    branch: str | None = None,
    cache_dir: str | None = None,
):
    """
    Set up the provider from explicit arguments, falling back to settings.git.

    Args:
        repo_url: Git repository URL (settings.git.default_repo_url when omitted)
        branch: Default branch to clone (settings.git.default_branch when omitted)
        cache_dir: Cache directory (settings.git.cache_dir when omitted)

    Raises:
        ImportError: If GitPython is not installed
        ValueError: If no repository URL is available from either source
    """
    if not GitPython_available:
        raise ImportError(
            "GitPython is required for Git provider. Install with: pip install GitPython"
        )

    git_settings = settings.git

    resolved_url = repo_url or git_settings.default_repo_url
    self.repo_url = resolved_url
    if not resolved_url:
        raise ValueError(
            "Git repository URL not provided. Set GIT__DEFAULT_REPO_URL or pass repo_url argument."
        )

    # Narrowing aid for type checkers; cannot fail after the check above.
    assert self.repo_url is not None

    self.branch = branch or git_settings.default_branch
    self.cache_dir = Path(cache_dir or git_settings.cache_dir)
    self.ssh_key_path = git_settings.ssh_key_path
    self.known_hosts_path = git_settings.known_hosts_path
    self.shallow = git_settings.shallow_clone

    # The cache root must exist before any clone is attempted.
    self.cache_dir.mkdir(parents=True, exist_ok=True)

    # Short, stable cache key derived from the repository URL.
    url_digest = hashlib.sha256(self.repo_url.encode()).hexdigest()
    self.repo_hash = url_digest[:16]

    logger.debug(
        f"Initialized GitProvider: repo={self.repo_url}, "
        f"branch={self.branch}, cache={self.cache_dir}"
    )
343
+
344
+ def _get_cached_repo_path(self, ref: str | None = None) -> Path:
345
+ """
346
+ Get local path for cached repository.
347
+
348
+ Args:
349
+ ref: Git ref (branch, tag, or commit). Uses default branch if None.
350
+
351
+ Returns:
352
+ Path to local cached repository
353
+
354
+ Examples:
355
+ >>> provider._get_cached_repo_path()
356
+ Path('/tmp/rem-git-cache/a1b2c3d4e5f6/main')
357
+ >>> provider._get_cached_repo_path('v1.0.0')
358
+ Path('/tmp/rem-git-cache/a1b2c3d4e5f6/v1.0.0')
359
+ """
360
+ ref = ref or self.branch
361
+ return self.cache_dir / self.repo_hash / ref
362
+
363
def _setup_git_ssh(self) -> dict[str, str]:
    """
    Configure Git SSH authentication via environment variables.

    Returns a copy of the current process environment. When the configured
    SSH key file exists, GIT_SSH_COMMAND is added so that git invokes ssh
    with that key, the configured known_hosts file and strict host key
    checking. When the key path is unset or the file is missing, a warning
    is logged and the environment copy is returned without GIT_SSH_COMMAND.
    This method does not raise for a missing key.

    Both paths are shell-quoted because git hands GIT_SSH_COMMAND to the
    shell; for paths without special characters the quoting is a no-op.

    Returns:
        Environment variables dict for Git commands

    Examples:
        >>> env = provider._setup_git_ssh()
        >>> env['GIT_SSH_COMMAND']
        'ssh -i /etc/git-secret/ssh -o UserKnownHostsFile=/etc/git-secret/known_hosts -o StrictHostKeyChecking=yes'
    """
    import shlex

    env = os.environ.copy()
    key_path = self.ssh_key_path

    # Guard against an unset key path before building a Path from it.
    if key_path and Path(key_path).exists():
        quoted_key = shlex.quote(str(key_path))
        quoted_known_hosts = shlex.quote(str(self.known_hosts_path))
        env["GIT_SSH_COMMAND"] = (
            f"ssh -i {quoted_key} "
            f"-o UserKnownHostsFile={quoted_known_hosts} "
            "-o StrictHostKeyChecking=yes"
        )
        logger.debug(f"Configured Git SSH: key={self.ssh_key_path}")
    else:
        logger.warning(
            f"SSH key not found at {self.ssh_key_path}. "
            "Falling back to default Git authentication."
        )

    return env
399
+
400
def _setup_git_https(self, repo_url: str) -> str:
    """
    Configure HTTPS authentication with Personal Access Token.

    Injects PAT into HTTPS URL for authentication:
    https://github.com/org/repo.git → https://{token}@github.com/org/repo.git

    Any userinfo already present in the URL (e.g. "user@host") is replaced
    by the token rather than prefixed, and the token is percent-encoded so
    characters such as "@", ":" or "/" cannot corrupt the URL.

    Args:
        repo_url: Original HTTPS repository URL

    Returns:
        URL with embedded PAT, or repo_url unchanged when no token is
        configured or the scheme is not http/https

    Security Note:
        This method never writes the token to the log.

    Examples:
        >>> provider._setup_git_https("https://github.com/org/repo.git")
        'https://ghp_token123@github.com/org/repo.git'
    """
    from urllib.parse import quote

    token = settings.git.personal_access_token
    if not token:
        logger.warning("No Personal Access Token configured for HTTPS authentication.")
        return repo_url

    parsed = urlparse(repo_url)
    if parsed.scheme not in ("https", "http"):
        return repo_url

    # Keep only host[:port]; drop credentials that were already in the URL.
    host = parsed.netloc.rpartition("@")[2]
    encoded_token = quote(str(token), safe="")

    authed_url = f"{parsed.scheme}://{encoded_token}@{host}{parsed.path}"
    logger.debug("Configured Git HTTPS authentication with PAT")
    return authed_url
434
+
435
def _clone_or_update(self, ref: str | None = None) -> Repo:
    """
    Clone repository or update existing clone.

    Handles both initial cloning and updating existing repositories.
    Uses shallow clones (--depth=1) for performance when enabled.

    A cache directory that exists but is not a usable Git repository (for
    example the remains of an interrupted clone) is removed and re-cloned
    instead of failing every later call. A clone that fails removes its
    own target directory so the cache is not left poisoned.

    Args:
        ref: Git ref to checkout (branch, tag, or commit)

    Returns:
        GitPython Repo object

    Raises:
        GitCommandError: If clone/fetch fails, or if checkout of both the
            requested ref and the default branch fails
        ValueError: If no repository URL is configured

    Workflow:
        1. Check if repo already cloned
        2. If exists and valid: fetch + checkout ref (falls back to the
           default branch with a warning when the ref cannot be checked out)
        3. Otherwise: clone with ref
        4. Return Repo object

    Examples:
        >>> repo = provider._clone_or_update("v1.0.0")
        >>> repo.head.commit
        <git.Commit "abc123...">
    """
    repo_path = self._get_cached_repo_path(ref)
    ref = ref or self.branch

    # Setup authentication
    env = self._setup_git_ssh()
    repo_url = self.repo_url

    if not repo_url:
        raise ValueError("repo_url is required for cloning")

    # If HTTPS, inject PAT
    if repo_url.startswith(("https://", "http://")):
        repo_url = self._setup_git_https(repo_url)

    try:
        if repo_path.exists():
            try:
                repo = Repo(repo_path)
            except (InvalidGitRepositoryError, NoSuchPathError):
                # Leftover from a failed clone: discard it and clone afresh.
                logger.warning(
                    f"Cached path {repo_path} is not a valid Git repository. Re-cloning."
                )
                shutil.rmtree(repo_path, ignore_errors=True)
            else:
                logger.debug(f"Updating existing repo at {repo_path}")

                # Fetch latest changes
                repo.remotes.origin.fetch(env=env)

                # Checkout requested ref
                try:
                    repo.git.checkout(ref, env=env)
                    logger.info(f"Checked out ref: {ref}")
                except GitCommandError as e:
                    logger.warning(
                        f"Failed to checkout ref '{ref}': {e}. "
                        f"Falling back to default branch '{self.branch}'"
                    )
                    repo.git.checkout(self.branch, env=env)

                return repo

        # Log the configured URL, not the token-bearing one used to clone.
        logger.info(f"Cloning repo {self.repo_url} to {repo_path}")

        # Create only the parent; git creates the checkout directory itself.
        repo_path.parent.mkdir(parents=True, exist_ok=True)

        try:
            # Clone with explicit arguments (mypy-safe)
            if self.shallow:
                logger.debug("Using shallow clone (--depth=1)")
                repo = Repo.clone_from(
                    repo_url,
                    to_path=str(repo_path),
                    branch=ref,
                    env=env,
                    depth=1,
                )
            else:
                repo = Repo.clone_from(
                    repo_url,
                    to_path=str(repo_path),
                    branch=ref,
                    env=env,
                )
        except GitCommandError:
            # Do not leave a partial checkout behind for the next call.
            shutil.rmtree(repo_path, ignore_errors=True)
            raise

        logger.info(f"Successfully cloned repo to {repo_path}")
        return repo

    except GitCommandError as e:
        # NOTE(review): the error text may include the git command line;
        # confirm the installed GitPython redacts URL credentials.
        logger.error(f"Git operation failed: {e}")
        raise
527
+
528
def _get_local_path(self, path: str, ref: str | None = None) -> Path:
    """
    Get local filesystem path for a Git repository file.

    Clones repo if needed, then returns path to file in cached repo.
    Paths that would resolve outside the checkout (absolute paths, ".."
    segments, symlinks pointing elsewhere) are rejected, so a URI cannot
    be used to reach arbitrary files on the host.

    Args:
        path: Path within repository (e.g., "schemas/agent.yaml")
        ref: Git ref to checkout

    Returns:
        Local filesystem path to file

    Raises:
        FileNotFoundError: If the path escapes the repository checkout or
            does not exist in the repo

    Examples:
        >>> provider._get_local_path("schemas/agent.yaml", "v1.0.0")
        Path('/tmp/rem-git-cache/a1b2c3d4/v1.0.0/schemas/agent.yaml')
    """
    repo = self._clone_or_update(ref)
    repo_root = Path(repo.working_dir)
    local_path = repo_root / path

    # Compare fully resolved locations for the containment check only; the
    # returned path keeps its original, unresolved form.
    if not local_path.resolve().is_relative_to(repo_root.resolve()):
        raise FileNotFoundError(
            f"Path '{path}' resolves outside the Git repository checkout"
        )

    if not local_path.exists():
        raise FileNotFoundError(
            f"Path '{path}' not found in Git repository at ref '{ref or self.branch}'"
        )

    return local_path
557
+
558
def exists(self, uri: str) -> bool:
    """
    Report whether a file or directory is present in the Git repository.

    Args:
        uri: Git URI (git://path/to/file.yaml?ref=tag)

    Returns:
        True if the path is present in the checkout; False if it is missing
        or the underlying Git operation fails

    Examples:
        >>> provider.exists("git://schemas/agent.yaml")
        True
        >>> provider.exists("git://schemas/agent.yaml?ref=v1.0.0")
        True
        >>> provider.exists("git://nonexistent.yaml")
        False
    """
    repo_relative, git_ref = parse_git_uri(uri)

    try:
        checkout_path = self._get_local_path(repo_relative, git_ref)
    except (FileNotFoundError, GitCommandError):
        # Missing path or failed clone/fetch both count as "does not exist".
        return False

    return checkout_path.exists()
583
+
584
def read(self, uri: str, **options) -> Any:
    """
    Read a file from the Git repository.

    The file is located in the local checkout and then handed to
    LocalProvider.read, so format handling is whatever LocalProvider
    implements for the file's path.

    Args:
        uri: Git URI (git://path/to/file.yaml?ref=tag)
        **options: Read options forwarded unchanged to LocalProvider.read

    Returns:
        Whatever LocalProvider.read returns for the checked-out file

    Raises:
        FileNotFoundError: If file doesn't exist in repo

    Examples:
        >>> schema = provider.read("git://schemas/agent.yaml")
        >>> data = provider.read("git://experiments/data.csv")
        >>> image = provider.read("git://assets/logo.png")
    """
    repo_relative, git_ref = parse_git_uri(uri)
    checkout_file = self._get_local_path(repo_relative, git_ref)

    # Imported here rather than at module level, as in the original.
    from rem.services.fs.local_provider import LocalProvider

    return LocalProvider().read(str(checkout_file), **options)
619
+
620
def ls(self, uri: str, **options) -> list[str]:
    """
    List files in Git repository directory.

    The listing is recursive and contains files only. Git's own metadata
    under the top-level ".git" directory is never included.

    Args:
        uri: Git URI (git://path/to/dir/?ref=tag)
        **options: Provider options (currently unused)

    Returns:
        Sorted list of file paths (relative to repo root)

    Raises:
        FileNotFoundError: If the directory doesn't exist in repo
        ValueError: If the path exists but is not a directory

    Examples:
        >>> provider.ls("git://schemas/")
        ['schemas/agent-1.yaml', 'schemas/agent-2.yaml']
        >>> provider.ls("git://experiments/hello-world/?ref=v1.0.0")
        ['experiments/hello-world/ground_truth.csv', 'experiments/hello-world/config.yaml']
    """
    path, ref = parse_git_uri(uri)
    local_path = self._get_local_path(path, ref)

    if not local_path.is_dir():
        raise ValueError(f"Path '{path}' is not a directory")

    # Build repo-relative names from the requested path plus each file's
    # position below it. This avoids depending on the cache root and the
    # checkout's working directory being spelled the same way.
    prefix = Path(path)
    files = []

    for file_path in local_path.rglob("*"):
        if not file_path.is_file():
            continue
        relative = prefix / file_path.relative_to(local_path)
        # Skip Git internals when the listing starts at the repo root.
        if relative.parts[:1] == (".git",):
            continue
        files.append(str(relative))

    return sorted(files)
654
+
655
def ls_iter(self, uri: str, **options) -> Iterator[str]:
    """
    Lazily yield the file paths that ``ls`` returns for a directory.

    This is a thin generator wrapper: the full listing is still computed by
    ``ls`` on first iteration and then yielded one path at a time.

    Args:
        uri: Git URI of a directory (git://path/to/dir/?ref=tag)
        **options: Forwarded unchanged to ``ls``

    Yields:
        File paths, in the order ``ls`` returns them

    Examples:
        >>> for file in provider.ls_iter("git://schemas/"):
        ...     print(file)
        schemas/agent-1.yaml
        schemas/agent-2.yaml
    """
    yield from self.ls(uri, **options)
674
+
675
def clear_cache(self, ref: str | None = None):
    """
    Delete cached checkout data from disk.

    With a truthy ``ref`` only the directory returned by
    ``_get_cached_repo_path(ref)`` is removed. Otherwise (``None`` or an
    empty string) the whole ``cache_dir / repo_hash`` directory is removed.
    A directory that does not exist is silently left alone.

    Args:
        ref: Specific ref to clear, or None to clear all refs

    Examples:
        >>> provider.clear_cache("v1.0.0")  # Clear specific tag
        >>> provider.clear_cache()  # Clear all refs
    """
    if ref:
        ref_dir = self._get_cached_repo_path(ref)
        if not ref_dir.exists():
            return
        shutil.rmtree(ref_dir)
        logger.info(f"Cleared cache for ref: {ref}")
        return

    all_refs_dir = self.cache_dir / self.repo_hash
    if not all_refs_dir.exists():
        return
    shutil.rmtree(all_refs_dir)
    logger.info(f"Cleared all cached refs for repo: {self.repo_url}")
701
+
702
+ def get_current_commit(self, ref: str | None = None) -> str:
703
+ """
704
+ Get current commit hash for ref.
705
+
706
+ Useful for tracking which version of schema is currently loaded.
707
+
708
+ Args:
709
+ ref: Git ref (branch, tag, or commit)
710
+
711
+ Returns:
712
+ Full commit hash (40 characters)
713
+
714
+ Examples:
715
+ >>> provider.get_current_commit("v1.0.0")
716
+ 'abc123def456...'
717
+ >>> provider.get_current_commit() # Current branch
718
+ 'def456abc123...'
719
+ """
720
+ repo = self._clone_or_update(ref)
721
+ return repo.head.commit.hexsha
722
+
723
def get_semantic_versions(self, file_path: str, pattern: str | None = None) -> list[dict[str, Any]]:
    """
    List the semver-tagged versions of the repository that contain a file.

    Every tag whose name contains ``MAJOR.MINOR.PATCH`` (optionally prefixed
    with ``v``, either at the start of the tag or after a ``/`` as in
    ``schemas/test/v2.1.0``) is considered. A tag is reported only if
    ``file_path`` is present in the tree of the commit it points to.

    The lookup reads each tag's commit tree directly; the working tree of
    the cached clone is never checked out to a tag, so the call leaves the
    clone exactly as ``_clone_or_update`` returned it.

    Args:
        file_path: Repo-relative path (e.g., "schemas/agent.yaml")
        pattern: Optional regex applied with ``search`` to the tag name
            (e.g., "v2\\..*" for v2.x.x); non-matching tags are skipped

    Returns:
        List of version dicts sorted by (major, minor, patch), newest first.
        Empty if the repository has no tags or no matching tag contains the
        file (no exception is raised in that case):
        [
            {
                "tag": "v2.1.1",
                "version": (2, 1, 1),
                "commit": "abc123...",
                "date": "2025-01-15T10:30:00",
                "message": "feat: Add confidence scoring",
                "author": "alice@example.com"
            },
            ...
        ]

    Raises:
        re.error: If ``pattern`` is not a valid regular expression

    Examples:
        >>> versions = provider.get_semantic_versions("schemas/cv-parser.yaml")
        >>> [v["tag"] for v in versions]
        ['v2.1.1', 'v2.1.0']

        >>> v2_versions = provider.get_semantic_versions(
        ...     "schemas/cv-parser.yaml",
        ...     pattern="v2\\..*"
        ... )
    """
    import re
    from datetime import datetime

    repo = self._clone_or_update()

    # Get all tags from repository
    tags = repo.tags

    if not tags:
        logger.warning(f"No tags found in repository {self.repo_url}")
        return []

    # Compile both patterns once instead of re-parsing them per tag.
    tag_filter = re.compile(pattern) if pattern else None
    # Supports both flat tags (v2.1.0) and path-based tags (schemas/test/v2.1.0)
    semver_pattern = re.compile(r"(?:^|/)v?(\d+)\.(\d+)\.(\d+)")

    # Git trees are addressed with forward slashes and no "./" prefix.
    # A bare "." means the repository root, which exists in every commit.
    lookup_path = Path(file_path).as_posix()

    versions = []

    for tag in tags:
        tag_name = tag.name

        # Apply user-provided pattern filter
        if tag_filter is not None and not tag_filter.search(tag_name):
            continue

        # Extract semantic version (MAJOR.MINOR.PATCH)
        match = semver_pattern.search(tag_name)
        if not match:
            continue  # Skip non-semver tags

        major, minor, patch = map(int, match.groups())

        # Check the tag's own tree rather than checking the tag out: this
        # avoids rewriting the working tree of the cached clone on every
        # tag and cannot leave the clone on a detached tag if something
        # fails midway.
        try:
            commit = tag.commit
            if lookup_path != ".":
                commit.tree.join(lookup_path)
        except KeyError:
            continue  # File doesn't exist in this version
        except (ValueError, GitCommandError):
            # Tag does not resolve to a readable commit; skip it.
            continue

        # NOTE: naive timestamp in the host's local timezone (kept for
        # backward compatibility of the "date" string format).
        commit_date = datetime.fromtimestamp(commit.committed_date)

        versions.append({
            "tag": tag_name,
            "version": (major, minor, patch),
            "commit": commit.hexsha,
            "date": commit_date.isoformat(),
            "message": commit.message.strip(),
            "author": commit.author.email,
        })

    # Sort by semantic version (newest first)
    versions.sort(key=lambda v: v["version"], reverse=True)

    logger.info(
        f"Found {len(versions)} semantic versions for {file_path} "
        f"(pattern: {pattern or 'all'})"
    )

    return versions
870
+
871
def diff_versions(
    self,
    file_path: str,
    version1: str,
    version2: str,
    unified: int = 3
) -> str:
    """
    Generate a unified diff of one file between two revisions.

    Runs ``git diff <version1> <version2> -- <file_path>`` in the cached
    clone and returns git's output verbatim.

    Args:
        file_path: Path to file in repository
        version1: First version tag (e.g., "v2.1.0")
        version2: Second version tag (e.g., "v2.1.1")
        unified: Number of context lines (default: 3)

    Returns:
        Diff text as produced by ``git diff`` (empty string when git
        reports no differences)

    Raises:
        ValueError: If a version is empty or starts with "-" (it would be
            parsed by git as a command-line option rather than a revision),
            or if the underlying git command fails

    Examples:
        >>> diff = provider.diff_versions(
        ...     "schemas/cv-parser.yaml",
        ...     "v2.1.0",
        ...     "v2.1.1"
        ... )
        >>> print(diff)
        diff --git a/schemas/cv-parser.yaml b/schemas/cv-parser.yaml
        --- a/schemas/cv-parser.yaml
        +++ b/schemas/cv-parser.yaml
        @@ -10,6 +10,7 @@
        ...
    """
    # The revisions are placed before the "--" separator, so git would
    # interpret a leading "-" as an option (e.g. "--output=<file>").
    # Reject such values before touching the repository.
    for label, revision in (("version1", version1), ("version2", version2)):
        if not revision or str(revision).startswith("-"):
            raise ValueError(
                f"Invalid {label} {revision!r}: expected a tag, branch or commit"
            )

    repo = self._clone_or_update()

    try:
        # Generate diff using git diff command
        return repo.git.diff(
            version1,
            version2,
            "--",
            file_path,
            unified=unified
        )

    except GitCommandError as e:
        logger.error(f"Failed to generate diff: {e}")
        raise ValueError(
            f"Could not diff {file_path} between {version1} and {version2}. "
            "Ensure both tags exist and file is present in both versions."
        ) from e