remdb 0.3.230__py3-none-any.whl → 0.3.258__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rem/agentic/__init__.py +10 -1
- rem/agentic/context.py +13 -2
- rem/agentic/context_builder.py +45 -34
- rem/agentic/providers/pydantic_ai.py +302 -110
- rem/api/mcp_router/resources.py +223 -0
- rem/api/mcp_router/tools.py +76 -10
- rem/api/routers/auth.py +113 -10
- rem/api/routers/chat/child_streaming.py +22 -8
- rem/api/routers/chat/completions.py +3 -3
- rem/api/routers/chat/sse_events.py +3 -3
- rem/api/routers/chat/streaming.py +40 -45
- rem/api/routers/chat/streaming_utils.py +5 -7
- rem/api/routers/feedback.py +2 -2
- rem/api/routers/query.py +5 -14
- rem/cli/commands/ask.py +144 -33
- rem/cli/commands/experiments.py +1 -1
- rem/cli/commands/process.py +9 -1
- rem/cli/commands/query.py +109 -0
- rem/cli/commands/session.py +117 -0
- rem/cli/main.py +2 -0
- rem/models/core/experiment.py +1 -1
- rem/models/entities/session.py +1 -0
- rem/schemas/agents/core/agent-builder.yaml +1 -1
- rem/schemas/agents/test_orchestrator.yaml +42 -0
- rem/schemas/agents/test_structured_output.yaml +52 -0
- rem/services/content/providers.py +151 -49
- rem/services/postgres/repository.py +1 -0
- rem/services/rem/README.md +4 -3
- rem/services/rem/parser.py +7 -10
- rem/services/rem/service.py +47 -0
- rem/services/session/compression.py +7 -3
- rem/services/session/pydantic_messages.py +25 -7
- rem/services/session/reload.py +2 -1
- rem/settings.py +64 -7
- rem/sql/migrations/004_cache_system.sql +3 -1
- rem/utils/schema_loader.py +135 -103
- {remdb-0.3.230.dist-info → remdb-0.3.258.dist-info}/METADATA +6 -5
- {remdb-0.3.230.dist-info → remdb-0.3.258.dist-info}/RECORD +40 -37
- {remdb-0.3.230.dist-info → remdb-0.3.258.dist-info}/WHEEL +0 -0
- {remdb-0.3.230.dist-info → remdb-0.3.258.dist-info}/entry_points.txt +0 -0
rem/cli/commands/session.py
CHANGED
|
@@ -331,6 +331,123 @@ async def _show_async(
|
|
|
331
331
|
raise
|
|
332
332
|
|
|
333
333
|
|
|
334
|
+
@session.command("clone")
@click.argument("session_id")
# Turns are 1-based (turn N = the N-th user message and its responses).
# IntRange rejects 0 and negative values at the CLI boundary so they never
# reach the truncation logic, where they would leave nothing to clone.
@click.option(
    "--to-turn",
    "-t",
    type=click.IntRange(min=1),
    help="Clone up to turn N (counting user messages only)",
)
@click.option("--name", "-n", help="Name/description for the cloned session")
def clone(session_id: str, to_turn: int | None, name: str | None):
    """
    Clone a session for exploring alternate conversation paths.

    SESSION_ID: The session ID to clone.

    Examples:

        # Clone entire session
        rem session clone 810f1f2d-d5a1-4c02-83b6-67040b47f7c0

        # Clone up to turn 3 (first 3 user messages and their responses)
        rem session clone 810f1f2d-d5a1-4c02-83b6-67040b47f7c0 --to-turn 3

        # Clone with a descriptive name
        rem session clone 810f1f2d-d5a1-4c02-83b6-67040b47f7c0 -n "Alternate anxiety path"
    """
    # The command itself is synchronous; the database work lives in the
    # coroutine and is driven to completion here.
    asyncio.run(_clone_async(session_id, to_turn, name))
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
async def _clone_async(
    session_id: str,
    to_turn: int | None,
    name: str | None,
):
    """Copy a session's messages into a new CLONE-mode session.

    Loads the messages of ``session_id`` in creation order, optionally keeps
    only the first ``to_turn`` turns (a turn starts at each "user" message),
    creates a ``Session`` record that points back at the source session, and
    re-inserts every kept message under the new session id.

    Args:
        session_id: ID of the session to copy from.
        to_turn: Number of user turns to keep, or ``None`` to keep everything.
        name: Optional name for the new session; defaults to
            ``"Clone of <first 8 chars of session_id>"``.

    Returns:
        None. Progress and failures are reported via ``logger`` and
        ``click.echo``; the function returns early when PostgreSQL is
        unavailable or there is nothing to clone.
    """
    from uuid import uuid4
    from ...models.entities.session import Session, SessionMode

    # Upper bound on messages fetched from the source session.
    fetch_limit = 1000

    pg = get_postgres_service()
    if not pg:
        logger.error("PostgreSQL not available")
        return

    await pg.connect()

    try:
        # Load original session messages
        message_repo = Repository(Message, "messages", db=pg)
        messages = await message_repo.find(
            filters={"session_id": session_id},
            order_by="created_at ASC",
            limit=fetch_limit,
        )

        if not messages:
            logger.error(f"No messages found for session {session_id}")
            return

        # Hitting the cap means the source may be longer than what was
        # fetched; say so instead of silently producing a partial clone.
        if len(messages) >= fetch_limit:
            logger.warning(
                f"Session {session_id} returned {len(messages)} messages "
                f"(fetch limit {fetch_limit}); the clone may be incomplete"
            )

        # If --to-turn specified, filter messages up to that turn (user messages)
        if to_turn is not None:
            user_count = 0
            cutoff_idx = len(messages)
            for idx, msg in enumerate(messages):
                if msg.message_type == "user":
                    user_count += 1
                    # The first user message beyond the requested turn marks
                    # the (exclusive) end of the slice to keep.
                    if user_count > to_turn:
                        cutoff_idx = idx
                        break
            messages = messages[:cutoff_idx]

            # A non-positive turn (or a cut before the first message) leaves
            # nothing to copy; bail out before indexing messages[0] below.
            if not messages:
                logger.error(
                    f"No messages to clone up to turn {to_turn} for session {session_id}"
                )
                return

            logger.info(f"Cloning {len(messages)} messages (up to turn {to_turn})")
        else:
            logger.info(f"Cloning all {len(messages)} messages")

        # Get user_id and tenant_id from first message
        first_msg = messages[0]
        user_id = first_msg.user_id
        tenant_id = first_msg.tenant_id or "default"

        # Use an explicit None check so the description agrees with the
        # truncation branch above.
        turn_suffix = f" at turn {to_turn}" if to_turn is not None else ""

        # Create Session record with CLONE mode and lineage
        session_repo = Repository(Session, "sessions", db=pg)
        new_session = Session(
            id=uuid4(),
            name=name or f"Clone of {session_id[:8]}",
            mode=SessionMode.CLONE,
            original_trace_id=session_id,
            description=f"Cloned from session {session_id}" + turn_suffix,
            user_id=user_id,
            tenant_id=tenant_id,
            message_count=len(messages),
        )
        await session_repo.upsert(new_session)
        logger.info(f"Created session record: {new_session.id}")

        # Copy messages with new session_id (fresh ids, same payload)
        for msg in messages:
            new_msg = Message(
                id=uuid4(),
                user_id=msg.user_id,
                tenant_id=msg.tenant_id,
                session_id=str(new_session.id),
                content=msg.content,
                message_type=msg.message_type,
                metadata=msg.metadata,
            )
            await message_repo.upsert(new_msg)

        click.echo(f"\n✅ Cloned session successfully!")
        click.echo(f" Original: {session_id}")
        click.echo(f" New: {new_session.id}")
        click.echo(f" Messages: {len(messages)}")
        if to_turn is not None:
            click.echo(f" Turns: {to_turn}")
        click.echo(f"\nContinue this session with:")
        click.echo(f" rem ask <agent> \"your message\" --session-id {new_session.id}")

    finally:
        # Always release the connection, including on early returns above.
        await pg.disconnect()
|
|
449
|
+
|
|
450
|
+
|
|
334
451
|
def register_command(cli_group):
|
|
335
452
|
"""Register the session command group."""
|
|
336
453
|
cli_group.add_command(session)
|
rem/cli/main.py
CHANGED
|
@@ -97,6 +97,7 @@ from .commands.mcp import register_command as register_mcp_command
|
|
|
97
97
|
from .commands.scaffold import scaffold as scaffold_command
|
|
98
98
|
from .commands.cluster import register_commands as register_cluster_commands
|
|
99
99
|
from .commands.session import register_command as register_session_command
|
|
100
|
+
from .commands.query import register_command as register_query_command
|
|
100
101
|
|
|
101
102
|
register_schema_commands(schema)
|
|
102
103
|
register_db_commands(db)
|
|
@@ -107,6 +108,7 @@ register_ask_command(cli)
|
|
|
107
108
|
register_configure_command(cli)
|
|
108
109
|
register_serve_command(cli)
|
|
109
110
|
register_mcp_command(cli)
|
|
111
|
+
register_query_command(cli)
|
|
110
112
|
cli.add_command(experiments_group)
|
|
111
113
|
cli.add_command(scaffold_command)
|
|
112
114
|
register_session_command(cli)
|
rem/models/core/experiment.py
CHANGED
|
@@ -461,7 +461,7 @@ class ExperimentConfig(BaseModel):
|
|
|
461
461
|
"""
|
|
462
462
|
Get the evaluator filename with task prefix.
|
|
463
463
|
|
|
464
|
-
Returns: {agent_name}-{task}.yaml (e.g.,
|
|
464
|
+
Returns: {agent_name}-{task}.yaml (e.g., rem-risk-assessment.yaml)
|
|
465
465
|
"""
|
|
466
466
|
return f"{self.agent_schema_ref.name}-{self.task}.yaml"
|
|
467
467
|
|
rem/models/entities/session.py
CHANGED
|
@@ -229,7 +229,7 @@ json_schema_extra:
|
|
|
229
229
|
- uri: rem://agents
|
|
230
230
|
description: "List all available agent schemas with descriptions"
|
|
231
231
|
- uri: rem://agents/{agent_name}
|
|
232
|
-
description: "Load a specific agent schema by name (e.g., 'rem', '
|
|
232
|
+
description: "Load a specific agent schema by name (e.g., 'rem', 'intake')"
|
|
233
233
|
tools:
|
|
234
234
|
- name: save_agent
|
|
235
235
|
description: "Save the agent schema. Only call when user approves the preview in Step 6."
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# TEST ORCHESTRATOR AGENT
|
|
3
|
+
# =============================================================================
|
|
4
|
+
# Parent agent that delegates to test_structured_output for testing
|
|
5
|
+
# the structured output persistence feature.
|
|
6
|
+
# =============================================================================
|
|
7
|
+
|
|
8
|
+
name: test_orchestrator
|
|
9
|
+
version: "1.0"
|
|
10
|
+
description: |
|
|
11
|
+
You are an orchestrator that helps analyze user messages.
|
|
12
|
+
|
|
13
|
+
When the user provides a message to analyze, you MUST:
|
|
14
|
+
1. Call the ask_agent tool to delegate to "test_structured_output"
|
|
15
|
+
2. Return the structured result to the user
|
|
16
|
+
|
|
17
|
+
## CRITICAL RULES
|
|
18
|
+
|
|
19
|
+
- ALWAYS call ask_agent with agent_name="test_structured_output"
|
|
20
|
+
- Pass the user's message as input_text
|
|
21
|
+
- Report back the structured result you receive
|
|
22
|
+
|
|
23
|
+
type: object
|
|
24
|
+
properties:
|
|
25
|
+
answer:
|
|
26
|
+
type: string
|
|
27
|
+
description: Response to the user
|
|
28
|
+
|
|
29
|
+
required:
|
|
30
|
+
- answer
|
|
31
|
+
|
|
32
|
+
json_schema_extra:
|
|
33
|
+
kind: agent
|
|
34
|
+
name: test_orchestrator
|
|
35
|
+
version: "1.0.0"
|
|
36
|
+
tags: [test, orchestrator]
|
|
37
|
+
tools:
|
|
38
|
+
- name: ask_agent
|
|
39
|
+
description: |
|
|
40
|
+
Delegate to the test_structured_output agent to analyze the message.
|
|
41
|
+
Always use agent_name="test_structured_output".
|
|
42
|
+
resources: []
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# TEST STRUCTURED OUTPUT AGENT
|
|
3
|
+
# =============================================================================
|
|
4
|
+
# Simple agent for testing structured_output: true functionality
|
|
5
|
+
# =============================================================================
|
|
6
|
+
|
|
7
|
+
name: test_structured_output
|
|
8
|
+
version: "1.0"
|
|
9
|
+
description: |
|
|
10
|
+
You are a test agent that produces structured output.
|
|
11
|
+
|
|
12
|
+
Your ONLY job is to return a structured response matching the schema below.
|
|
13
|
+
|
|
14
|
+
Based on the user's input, extract:
|
|
15
|
+
- summary: A brief summary of what they said
|
|
16
|
+
- sentiment: positive, negative, or neutral
|
|
17
|
+
- keywords: List of key words from their message
|
|
18
|
+
|
|
19
|
+
DO NOT ask questions. Just produce the structured output.
|
|
20
|
+
|
|
21
|
+
type: object
|
|
22
|
+
properties:
|
|
23
|
+
result:
|
|
24
|
+
type: object
|
|
25
|
+
description: Structured analysis result
|
|
26
|
+
properties:
|
|
27
|
+
summary:
|
|
28
|
+
type: string
|
|
29
|
+
description: Brief summary of the input
|
|
30
|
+
sentiment:
|
|
31
|
+
type: string
|
|
32
|
+
enum: [positive, negative, neutral]
|
|
33
|
+
description: Overall sentiment
|
|
34
|
+
keywords:
|
|
35
|
+
type: array
|
|
36
|
+
items:
|
|
37
|
+
type: string
|
|
38
|
+
description: Key words extracted from input
|
|
39
|
+
required: [summary, sentiment, keywords]
|
|
40
|
+
additionalProperties: false
|
|
41
|
+
|
|
42
|
+
required:
|
|
43
|
+
- result
|
|
44
|
+
|
|
45
|
+
json_schema_extra:
|
|
46
|
+
kind: agent
|
|
47
|
+
name: test_structured_output
|
|
48
|
+
version: "1.0.0"
|
|
49
|
+
tags: [test, structured-output]
|
|
50
|
+
structured_output: true
|
|
51
|
+
tools: []
|
|
52
|
+
resources: []
|
|
@@ -118,15 +118,40 @@ class DocProvider(ContentProvider):
|
|
|
118
118
|
- Images (.png, .jpg) - OCR text extraction
|
|
119
119
|
|
|
120
120
|
Handles:
|
|
121
|
-
- Text extraction with OCR fallback
|
|
121
|
+
- Text extraction with automatic OCR fallback for scanned documents
|
|
122
122
|
- Table detection and extraction
|
|
123
123
|
- Daemon process workaround for multiprocessing restrictions
|
|
124
|
+
|
|
125
|
+
Environment Variables:
|
|
126
|
+
EXTRACTION_OCR_FALLBACK: Enable OCR fallback (default: true)
|
|
127
|
+
EXTRACTION_OCR_THRESHOLD: Min chars before triggering OCR fallback (default: 100)
|
|
128
|
+
EXTRACTION_FORCE_OCR: Always use OCR, skip native extraction (default: false)
|
|
129
|
+
EXTRACTION_OCR_LANGUAGE: Tesseract language codes (default: eng)
|
|
124
130
|
"""
|
|
125
131
|
|
|
126
132
|
@property
|
|
127
133
|
def name(self) -> str:
|
|
128
134
|
return "doc"
|
|
129
135
|
|
|
136
|
+
def _get_env_bool(self, key: str, default: bool) -> bool:
|
|
137
|
+
"""Get boolean from environment variable."""
|
|
138
|
+
import os
|
|
139
|
+
val = os.environ.get(key, "").lower()
|
|
140
|
+
if val in ("true", "1", "yes"):
|
|
141
|
+
return True
|
|
142
|
+
elif val in ("false", "0", "no"):
|
|
143
|
+
return False
|
|
144
|
+
return default
|
|
145
|
+
|
|
146
|
+
def _get_env_int(self, key: str, default: int) -> int:
|
|
147
|
+
"""Get integer from environment variable."""
|
|
148
|
+
import os
|
|
149
|
+
val = os.environ.get(key, "")
|
|
150
|
+
try:
|
|
151
|
+
return int(val) if val else default
|
|
152
|
+
except ValueError:
|
|
153
|
+
return default
|
|
154
|
+
|
|
130
155
|
def _is_daemon_process(self) -> bool:
|
|
131
156
|
"""Check if running in a daemon process."""
|
|
132
157
|
try:
|
|
@@ -134,29 +159,34 @@ class DocProvider(ContentProvider):
|
|
|
134
159
|
except Exception:
|
|
135
160
|
return False
|
|
136
161
|
|
|
137
|
-
def _parse_in_subprocess(self, file_path: Path) -> dict:
|
|
162
|
+
def _parse_in_subprocess(self, file_path: Path, force_ocr: bool = False) -> dict:
|
|
138
163
|
"""Run kreuzberg in a separate subprocess to bypass daemon restrictions."""
|
|
139
|
-
|
|
164
|
+
import os
|
|
165
|
+
ocr_language = os.environ.get("EXTRACTION_OCR_LANGUAGE", "eng")
|
|
166
|
+
|
|
167
|
+
script = f"""
|
|
140
168
|
import json
|
|
141
169
|
import sys
|
|
142
170
|
from pathlib import Path
|
|
143
|
-
from kreuzberg import ExtractionConfig, extract_file_sync
|
|
171
|
+
from kreuzberg import ExtractionConfig, OcrConfig, extract_file_sync
|
|
144
172
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
)
|
|
173
|
+
force_ocr = {force_ocr}
|
|
174
|
+
|
|
175
|
+
if force_ocr:
|
|
176
|
+
config = ExtractionConfig(
|
|
177
|
+
force_ocr=True,
|
|
178
|
+
ocr=OcrConfig(backend="tesseract", language="{ocr_language}")
|
|
179
|
+
)
|
|
180
|
+
else:
|
|
181
|
+
config = ExtractionConfig()
|
|
151
182
|
|
|
152
183
|
result = extract_file_sync(Path(sys.argv[1]), config=config)
|
|
153
184
|
|
|
154
|
-
|
|
155
|
-
output = {
|
|
185
|
+
output = {{
|
|
156
186
|
'content': result.content,
|
|
157
|
-
'tables': [
|
|
158
|
-
'metadata':
|
|
159
|
-
}
|
|
187
|
+
'tables': [],
|
|
188
|
+
'metadata': {{}}
|
|
189
|
+
}}
|
|
160
190
|
print(json.dumps(output))
|
|
161
191
|
"""
|
|
162
192
|
|
|
@@ -173,9 +203,41 @@ print(json.dumps(output))
|
|
|
173
203
|
|
|
174
204
|
return json.loads(result.stdout)
|
|
175
205
|
|
|
206
|
+
def _extract_with_config(self, tmp_path: Path, force_ocr: bool = False) -> tuple[str, dict]:
    """Run kreuzberg on ``tmp_path`` in-process, natively or with forced OCR.

    Args:
        tmp_path: Path of the file to extract.
        force_ocr: When True, extraction is configured with ``force_ocr=True``
            and the tesseract backend, using the language from the
            EXTRACTION_OCR_LANGUAGE environment variable (default "eng").

    Returns:
        ``(text, extraction_metadata)`` where the metadata records the parser
        label ("kreuzberg" or "kreuzberg_ocr") and the file extension.
    """
    import os
    from kreuzberg import ExtractionConfig, OcrConfig, extract_file_sync

    language = os.environ.get("EXTRACTION_OCR_LANGUAGE", "eng")

    if not force_ocr:
        # Native extraction with kreuzberg's default configuration.
        extraction_config = ExtractionConfig()
        parser_label = "kreuzberg"
    else:
        ocr_settings = OcrConfig(backend="tesseract", language=language)
        extraction_config = ExtractionConfig(force_ocr=True, ocr=ocr_settings)
        parser_label = "kreuzberg_ocr"

    outcome = extract_file_sync(tmp_path, config=extraction_config)

    return outcome.content, {
        "parser": parser_label,
        "file_extension": tmp_path.suffix,
    }
|
|
232
|
+
|
|
176
233
|
def extract(self, content: bytes, metadata: dict[str, Any]) -> dict[str, Any]:
|
|
177
234
|
"""
|
|
178
|
-
Extract document content using Kreuzberg.
|
|
235
|
+
Extract document content using Kreuzberg with intelligent OCR fallback.
|
|
236
|
+
|
|
237
|
+
Process:
|
|
238
|
+
1. Try native text extraction first (fast, preserves structure)
|
|
239
|
+
2. If content is minimal (< threshold chars), retry with OCR
|
|
240
|
+
3. Use OCR result if it's better than native result
|
|
179
241
|
|
|
180
242
|
Args:
|
|
181
243
|
content: Document file bytes
|
|
@@ -184,49 +246,89 @@ print(json.dumps(output))
|
|
|
184
246
|
Returns:
|
|
185
247
|
dict with text and extraction metadata
|
|
186
248
|
"""
|
|
249
|
+
# Get OCR settings from environment
|
|
250
|
+
force_ocr = self._get_env_bool("EXTRACTION_FORCE_OCR", False)
|
|
251
|
+
ocr_fallback = self._get_env_bool("EXTRACTION_OCR_FALLBACK", True)
|
|
252
|
+
ocr_threshold = self._get_env_int("EXTRACTION_OCR_THRESHOLD", 100)
|
|
253
|
+
|
|
187
254
|
# Write bytes to temp file for kreuzberg
|
|
188
|
-
# Detect extension from metadata
|
|
189
255
|
content_type = metadata.get("content_type", "")
|
|
190
256
|
suffix = get_extension(content_type, default=".pdf")
|
|
191
257
|
|
|
192
258
|
with temp_file_from_bytes(content, suffix=suffix) as tmp_path:
|
|
259
|
+
ocr_used = False
|
|
260
|
+
ocr_fallback_triggered = False
|
|
261
|
+
native_char_count = 0
|
|
262
|
+
|
|
193
263
|
# Check if running in daemon process
|
|
194
264
|
if self._is_daemon_process():
|
|
195
|
-
logger.info("Daemon process detected - using subprocess workaround
|
|
265
|
+
logger.info("Daemon process detected - using subprocess workaround")
|
|
196
266
|
try:
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
267
|
+
if force_ocr:
|
|
268
|
+
result_dict = self._parse_in_subprocess(tmp_path, force_ocr=True)
|
|
269
|
+
text = result_dict["content"]
|
|
270
|
+
ocr_used = True
|
|
271
|
+
extraction_metadata = {
|
|
272
|
+
"parser": "kreuzberg_subprocess_ocr",
|
|
273
|
+
"file_extension": tmp_path.suffix,
|
|
274
|
+
}
|
|
275
|
+
else:
|
|
276
|
+
# Try native first
|
|
277
|
+
result_dict = self._parse_in_subprocess(tmp_path, force_ocr=False)
|
|
278
|
+
text = result_dict["content"]
|
|
279
|
+
native_char_count = len(text)
|
|
280
|
+
|
|
281
|
+
# OCR fallback if content is minimal
|
|
282
|
+
if ocr_fallback and len(text.strip()) < ocr_threshold:
|
|
283
|
+
logger.warning(f"Content below threshold ({len(text.strip())} < {ocr_threshold}) - trying OCR fallback")
|
|
284
|
+
try:
|
|
285
|
+
ocr_result = self._parse_in_subprocess(tmp_path, force_ocr=True)
|
|
286
|
+
ocr_text = ocr_result["content"]
|
|
287
|
+
if len(ocr_text.strip()) > len(text.strip()):
|
|
288
|
+
logger.info(f"OCR fallback improved result: {len(ocr_text)} chars (was {native_char_count})")
|
|
289
|
+
text = ocr_text
|
|
290
|
+
ocr_used = True
|
|
291
|
+
ocr_fallback_triggered = True
|
|
292
|
+
except Exception as e:
|
|
293
|
+
logger.warning(f"OCR fallback failed in subprocess: {e}")
|
|
294
|
+
|
|
295
|
+
extraction_metadata = {
|
|
296
|
+
"parser": "kreuzberg_subprocess" if not ocr_used else "kreuzberg_subprocess_ocr_fallback",
|
|
297
|
+
"file_extension": tmp_path.suffix,
|
|
298
|
+
}
|
|
204
299
|
except Exception as e:
|
|
205
|
-
logger.error(f"Subprocess parsing failed: {e}. Falling back to
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
config = ExtractionConfig(extract_tables=False)
|
|
209
|
-
result = extract_file_sync(tmp_path, config=config)
|
|
210
|
-
text = result.content
|
|
211
|
-
extraction_metadata = {
|
|
212
|
-
"parser": "kreuzberg_fallback",
|
|
213
|
-
"file_extension": tmp_path.suffix,
|
|
214
|
-
}
|
|
300
|
+
logger.error(f"Subprocess parsing failed: {e}. Falling back to direct call.")
|
|
301
|
+
text, extraction_metadata = self._extract_with_config(tmp_path, force_ocr=force_ocr)
|
|
302
|
+
ocr_used = force_ocr
|
|
215
303
|
else:
|
|
216
|
-
# Normal execution (not in daemon)
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
304
|
+
# Normal execution (not in daemon)
|
|
305
|
+
if force_ocr:
|
|
306
|
+
text, extraction_metadata = self._extract_with_config(tmp_path, force_ocr=True)
|
|
307
|
+
ocr_used = True
|
|
308
|
+
else:
|
|
309
|
+
# Try native first
|
|
310
|
+
text, extraction_metadata = self._extract_with_config(tmp_path, force_ocr=False)
|
|
311
|
+
native_char_count = len(text)
|
|
312
|
+
|
|
313
|
+
# OCR fallback if content is minimal
|
|
314
|
+
if ocr_fallback and len(text.strip()) < ocr_threshold:
|
|
315
|
+
logger.warning(f"Content below threshold ({len(text.strip())} < {ocr_threshold}) - trying OCR fallback")
|
|
316
|
+
try:
|
|
317
|
+
ocr_text, _ = self._extract_with_config(tmp_path, force_ocr=True)
|
|
318
|
+
if len(ocr_text.strip()) > len(text.strip()):
|
|
319
|
+
logger.info(f"OCR fallback improved result: {len(ocr_text)} chars (was {native_char_count})")
|
|
320
|
+
text = ocr_text
|
|
321
|
+
ocr_used = True
|
|
322
|
+
ocr_fallback_triggered = True
|
|
323
|
+
extraction_metadata["parser"] = "kreuzberg_ocr_fallback"
|
|
324
|
+
except Exception as e:
|
|
325
|
+
logger.warning(f"OCR fallback failed: {e}")
|
|
326
|
+
|
|
327
|
+
# Add OCR metadata
|
|
328
|
+
extraction_metadata["ocr_used"] = ocr_used
|
|
329
|
+
extraction_metadata["ocr_fallback_triggered"] = ocr_fallback_triggered
|
|
330
|
+
extraction_metadata["native_char_count"] = native_char_count
|
|
331
|
+
extraction_metadata["final_char_count"] = len(text)
|
|
230
332
|
|
|
231
333
|
return {
|
|
232
334
|
"text": text,
|
rem/services/rem/README.md
CHANGED
|
@@ -40,15 +40,16 @@ FuzzyQuery ::= FUZZY <text:string> [THRESHOLD <t:float>] [LIMIT <n:int>]
|
|
|
40
40
|
available : Stage 1+
|
|
41
41
|
example : FUZZY "sara" THRESHOLD 0.5 LIMIT 10
|
|
42
42
|
|
|
43
|
-
SearchQuery ::= SEARCH <text:string> [TABLE <table:string>] [WHERE <clause:string>] [LIMIT <n:int>]
|
|
43
|
+
SearchQuery ::= SEARCH <text:string> [(IN | TABLE) <table:string>] [WHERE <clause:string>] [LIMIT <n:int>]
|
|
44
44
|
text : Semantic query text
|
|
45
|
-
table : Target table (default: "resources")
|
|
45
|
+
table : Target table (default: "resources"). Use IN or TABLE keyword.
|
|
46
46
|
clause : Optional PostgreSQL WHERE clause for hybrid filtering (combines vector + structured)
|
|
47
47
|
limit : Max results (default: 10)
|
|
48
48
|
performance : Indexed (pgvector)
|
|
49
49
|
available : Stage 3+
|
|
50
50
|
examples :
|
|
51
|
-
- SEARCH "database migration"
|
|
51
|
+
- SEARCH "database migration" IN resources LIMIT 10
|
|
52
|
+
- SEARCH "parcel delivery" IN ontologies
|
|
52
53
|
- SEARCH "team discussion" TABLE moments WHERE "moment_type='meeting'" LIMIT 5
|
|
53
54
|
- SEARCH "project updates" WHERE "created_at >= '2024-01-01'" LIMIT 20
|
|
54
55
|
- SEARCH "AI research" WHERE "tags @> ARRAY['machine-learning']" LIMIT 10
|
rem/services/rem/parser.py
CHANGED
|
@@ -64,7 +64,7 @@ class RemQueryParser:
|
|
|
64
64
|
token_upper = token.upper()
|
|
65
65
|
|
|
66
66
|
# Handle REM keywords that take a value
|
|
67
|
-
if token_upper in ("LIMIT", "DEPTH", "THRESHOLD", "TYPE", "FROM", "WITH"):
|
|
67
|
+
if token_upper in ("LIMIT", "DEPTH", "THRESHOLD", "TYPE", "FROM", "WITH", "TABLE", "IN", "WHERE"):
|
|
68
68
|
if i + 1 < len(tokens):
|
|
69
69
|
keyword_map = {
|
|
70
70
|
"LIMIT": "limit",
|
|
@@ -73,6 +73,9 @@ class RemQueryParser:
|
|
|
73
73
|
"TYPE": "edge_types",
|
|
74
74
|
"FROM": "initial_query",
|
|
75
75
|
"WITH": "initial_query",
|
|
76
|
+
"TABLE": "table_name",
|
|
77
|
+
"IN": "table_name", # IN is alias for TABLE
|
|
78
|
+
"WHERE": "where_clause",
|
|
76
79
|
}
|
|
77
80
|
key = keyword_map[token_upper]
|
|
78
81
|
value = tokens[i + 1]
|
|
@@ -161,15 +164,9 @@ class RemQueryParser:
|
|
|
161
164
|
params["query_text"] = combined_value
|
|
162
165
|
|
|
163
166
|
elif query_type == QueryType.SEARCH:
|
|
164
|
-
# SEARCH expects: SEARCH <table> <
|
|
165
|
-
#
|
|
166
|
-
|
|
167
|
-
params["table_name"] = positional_args[0]
|
|
168
|
-
params["query_text"] = " ".join(positional_args[1:])
|
|
169
|
-
elif len(positional_args) == 1:
|
|
170
|
-
# Could be table name or query text - assume query text if no table
|
|
171
|
-
params["query_text"] = positional_args[0]
|
|
172
|
-
# If no positional args, params stays empty
|
|
167
|
+
# SEARCH expects: SEARCH <text> [TABLE <table>] [WHERE <clause>] [LIMIT n]
|
|
168
|
+
# All positional args are query_text, TABLE/WHERE/LIMIT are handled as keywords
|
|
169
|
+
params["query_text"] = combined_value
|
|
173
170
|
|
|
174
171
|
elif query_type == QueryType.TRAVERSE:
|
|
175
172
|
params["initial_query"] = combined_value
|
rem/services/rem/service.py
CHANGED
|
@@ -478,6 +478,53 @@ class RemService:
|
|
|
478
478
|
parser = RemQueryParser()
|
|
479
479
|
return parser.parse(query_string)
|
|
480
480
|
|
|
481
|
+
async def execute_query_string(
    self, query_string: str, user_id: str | None = None
) -> dict[str, Any]:
    """
    Execute a REM dialect query string directly.

    This is the unified entry point for executing REM queries from both
    the CLI and API. It handles parsing the query string, creating the
    RemQuery model, and executing it.

    Args:
        query_string: REM dialect query (e.g., 'LOOKUP "Sarah Chen"',
            'SEARCH "API design" IN resources', 'SELECT * FROM users')
        user_id: Optional user ID for query isolation

    Returns:
        Dict with query results and metadata:
        - query_type: The type of query executed
        - results: List of result rows
        - count: Number of results
        - Additional fields depending on query type

    Raises:
        ValueError: If the query string is invalid
        QueryExecutionError: If query execution fails

    Example:
        >>> result = await rem_service.execute_query_string(
        ...     'LOOKUP "Sarah Chen"',
        ...     user_id="user-123"
        ... )
        >>> print(result["count"])
        1
    """
    # Parse the query string into type and parameters
    query_type, parameters = self._parse_query_string(query_string)

    # Create and validate the RemQuery model; validating from a plain dict
    # lets the model coerce/verify the parsed values in one place.
    rem_query = RemQuery.model_validate({
        "query_type": query_type,
        "parameters": parameters,
        "user_id": user_id,
    })

    # Execute and return results
    return await self.execute_query(rem_query)
|
|
527
|
+
|
|
481
528
|
async def ask_rem(
|
|
482
529
|
self, natural_query: str, tenant_id: str, llm_model: str | None = None, plan_mode: bool = False
|
|
483
530
|
) -> dict[str, Any]:
|