claude-self-reflect 3.2.0 → 3.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/claude-self-reflect-test.md +601 -556
- package/Dockerfile.async-importer +4 -1
- package/Dockerfile.importer +4 -1
- package/Dockerfile.mcp-server +4 -1
- package/Dockerfile.safe-watcher +4 -1
- package/Dockerfile.streaming-importer +5 -2
- package/Dockerfile.watcher +4 -1
- package/mcp-server/src/server.py +2 -2
- package/package.json +1 -1
- package/scripts/import-conversations-unified.py +182 -35
package/Dockerfile.async-importer
CHANGED

@@ -1,10 +1,13 @@
 FROM python:3.13-slim

 # SECURITY: CVE-2025-58050 mitigation - PCRE2 heap buffer overflow
-#
+# SECURITY: CVE-2025-7709 mitigation - SQLite3 vulnerability
+# TODO: Remove explicit upgrades when base image includes patched versions
 RUN apt-get update && \
     (apt-get install -y --only-upgrade libpcre2-8-0 2>/dev/null || \
      echo "Warning: PCRE2 10.46+ not yet available") && \
+    (apt-get install -y --only-upgrade libsqlite3-0 sqlite3 2>/dev/null || \
+     echo "Warning: SQLite3 patch not yet available") && \
     apt-get upgrade -y && rm -rf /var/lib/apt/lists/*

 # Install system dependencies

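As a side note on the mitigation above: one way to confirm whether the upgraded libraries actually landed in a built image is to query dpkg from outside the container. This is a minimal sketch, not part of the package; the image tag is a placeholder, and Docker plus a Debian-based image are assumed.

    import subprocess

    PACKAGES = ["libpcre2-8-0", "libsqlite3-0", "sqlite3"]

    def installed_version(image: str, package: str) -> str:
        """Return the Debian package version installed in the image, or '' if absent."""
        result = subprocess.run(
            ["docker", "run", "--rm", image,
             "dpkg-query", "-W", "--showformat=${Version}", package],
            capture_output=True, text=True,
        )
        return result.stdout.strip() if result.returncode == 0 else ""

    if __name__ == "__main__":
        image = "claude-self-reflect-async-importer"  # placeholder tag, adjust to your build
        for pkg in PACKAGES:
            print(f"{pkg}: {installed_version(image, pkg) or 'not installed'}")
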
package/Dockerfile.importer
CHANGED

@@ -3,10 +3,13 @@ FROM python:3.13-slim
 WORKDIR /app

 # SECURITY: CVE-2025-58050 mitigation - PCRE2 heap buffer overflow
-#
+# SECURITY: CVE-2025-7709 mitigation - SQLite3 vulnerability
+# TODO: Remove explicit upgrades when base image includes patched versions
 RUN apt-get update && \
     (apt-get install -y --only-upgrade libpcre2-8-0 2>/dev/null || \
      echo "Warning: PCRE2 10.46+ not yet available") && \
+    (apt-get install -y --only-upgrade libsqlite3-0 sqlite3 2>/dev/null || \
+     echo "Warning: SQLite3 patch not yet available") && \
     apt-get upgrade -y && rm -rf /var/lib/apt/lists/*

 # Install dependencies directly (avoids file path issues with global npm installs)

package/Dockerfile.mcp-server
CHANGED

@@ -3,11 +3,14 @@ FROM python:3.13-slim
 WORKDIR /app

 # SECURITY: CVE-2025-58050 mitigation - PCRE2 heap buffer overflow
+# SECURITY: CVE-2025-7709 mitigation - SQLite3 vulnerability
 # Update system packages for security
-# TODO: Remove explicit
+# TODO: Remove explicit upgrades when base image includes patched versions
 RUN apt-get update && \
     (apt-get install -y --only-upgrade libpcre2-8-0 2>/dev/null || \
      echo "Warning: PCRE2 10.46+ not yet available") && \
+    (apt-get install -y --only-upgrade libsqlite3-0 sqlite3 2>/dev/null || \
+     echo "Warning: SQLite3 patch not yet available") && \
     apt-get upgrade -y && rm -rf /var/lib/apt/lists/*

 # Copy the MCP server package files

package/Dockerfile.safe-watcher
CHANGED

@@ -1,10 +1,13 @@
 FROM python:3.13-slim

 # SECURITY: CVE-2025-58050 mitigation - PCRE2 heap buffer overflow
-#
+# SECURITY: CVE-2025-7709 mitigation - SQLite3 vulnerability
+# TODO: Remove explicit upgrades when base image includes patched versions
 RUN apt-get update && \
     (apt-get install -y --only-upgrade libpcre2-8-0 2>/dev/null || \
      echo "Warning: PCRE2 10.46+ not yet available") && \
+    (apt-get install -y --only-upgrade libsqlite3-0 sqlite3 2>/dev/null || \
+     echo "Warning: SQLite3 patch not yet available") && \
     apt-get upgrade -y && rm -rf /var/lib/apt/lists/*

 # Install system dependencies

package/Dockerfile.streaming-importer
CHANGED

@@ -1,11 +1,14 @@
 FROM python:3.13-slim

 # SECURITY: CVE-2025-58050 mitigation - PCRE2 heap buffer overflow
-#
-#
+# SECURITY: CVE-2025-7709 mitigation - SQLite3 vulnerability
+# Attempting explicit upgrade of vulnerable packages
+# TODO: Remove explicit upgrades when base image includes patched versions
 RUN apt-get update && \
     (apt-get install -y --only-upgrade libpcre2-8-0 2>/dev/null || \
      echo "Warning: PCRE2 10.46+ not yet available, continuing with security updates") && \
+    (apt-get install -y --only-upgrade libsqlite3-0 sqlite3 2>/dev/null || \
+     echo "Warning: SQLite3 patch not yet available, continuing with security updates") && \
     apt-get upgrade -y && \
     apt-get install -y --no-install-recommends \
     gcc \

package/Dockerfile.watcher
CHANGED

@@ -1,11 +1,14 @@
 FROM python:3.13-slim

 # SECURITY: CVE-2025-58050 mitigation - PCRE2 heap buffer overflow
+# SECURITY: CVE-2025-7709 mitigation - SQLite3 vulnerability
 # Update system packages for security and install build dependencies for psutil
-# TODO: Remove explicit
+# TODO: Remove explicit upgrades when base image includes patched versions
 RUN apt-get update && \
     (apt-get install -y --only-upgrade libpcre2-8-0 2>/dev/null || \
      echo "Warning: PCRE2 10.46+ not yet available") && \
+    (apt-get install -y --only-upgrade libsqlite3-0 sqlite3 2>/dev/null || \
+     echo "Warning: SQLite3 patch not yet available") && \
     apt-get upgrade -y && \
     apt-get install -y gcc python3-dev && \
     rm -rf /var/lib/apt/lists/*

package/mcp-server/src/server.py
CHANGED

@@ -2065,8 +2065,8 @@ async def get_full_conversation(
         search_dirs.extend([
             base_path / project,
             base_path / sanitized_project,
-            base_path / f"-Users
-            base_path / f"-Users
+            base_path / f"-Users-*-projects-{project}",
+            base_path / f"-Users-*-projects-{sanitized_project}"
         ])
     else:
         # Search all project directories

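The replaced entries swap hard-coded user paths for glob-style wildcards. How server.py expands them is not shown in this diff; the following is a minimal, illustrative sketch of one way such patterns can be resolved with pathlib (the function name and example path are assumptions, not the server's code).

    from pathlib import Path

    def expand_search_dirs(base_path: Path, project: str) -> list[Path]:
        """Resolve wildcard patterns like '-Users-*-projects-<project>' into real directories."""
        dirs = [base_path / project]  # exact name first
        dirs.extend(p for p in base_path.glob(f"-Users-*-projects-{project}") if p.is_dir())
        return dirs

    # e.g. expand_search_dirs(Path.home() / ".claude" / "projects", "claude-self-reflect")
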
package/package.json
CHANGED

package/scripts/import-conversations-unified.py
CHANGED

@@ -11,6 +11,8 @@ import hashlib
 import gc
 import ast
 import re
+import fcntl
+import time
 from pathlib import Path
 from datetime import datetime
 from typing import List, Dict, Any, Optional, Set

@@ -45,6 +47,10 @@ MAX_CONCEPTS = 10
 MAX_AST_ELEMENTS = 30
 MAX_CODE_BLOCKS = 5
 MAX_ELEMENTS_PER_BLOCK = 10
+MAX_FILES_ANALYZED = 20
+MAX_FILES_EDITED = 20
+MAX_TOOLS_USED = 15
+MAX_CONCEPT_MESSAGES = 50

 # Robust cross-platform state file resolution
 def get_default_state_file():

@@ -171,7 +177,7 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
     # Check variance is above threshold
     import statistics
     variance = statistics.variance(embedding)
-    if variance < 1e-
+    if variance < 1e-4:  # Less strict threshold for valid embeddings
        logger.warning(f"Low variance embedding detected: {variance}")

     # Validate dimension

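The relaxed 1e-4 threshold flags near-constant embeddings without rejecting legitimate low-variance vectors. A self-contained sketch of the same check (the function name and dimensions are illustrative):

    import statistics

    def looks_degenerate(embedding: list[float], threshold: float = 1e-4) -> bool:
        """Flag near-constant vectors using sample variance, mirroring the check above."""
        return statistics.variance(embedding) < threshold

    assert looks_degenerate([0.5] * 384)                     # constant vector is flagged
    assert not looks_degenerate([0.1, 0.9, -0.3, 0.7] * 96)  # varied vector passes
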
@@ -194,7 +200,7 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
         "start_role": messages[0].get("role", "unknown") if messages else "unknown",
         "message_count": len(messages),
         "total_messages": total_messages,
-        "message_index": message_indices[0] if message_indices else
+        "message_index": message_indices[0] if message_indices else None,
         "message_indices": message_indices  # Store all indices in this chunk
     }

@@ -205,16 +211,22 @@ def process_and_upload_chunk(messages: List[Dict[str, Any]], chunk_index: int,
     # Create point
     point = PointStruct(
         id=int(point_id, 16) % (2**63),
-        vector=
+        vector=embedding,  # Use validated embedding variable
         payload=payload
     )

-    # Upload
-    client.upsert(
+    # Upload with wait to ensure persistence (with retries)
+    result = _with_retries(lambda: client.upsert(
         collection_name=collection_name,
         points=[point],
-        wait=
-    )
+        wait=True  # Ensure operation completed before continuing
+    ))
+
+    # Verify the operation completed successfully (handle enum or string representations)
+    status = getattr(result, 'status', None)
+    if status and 'completed' not in str(status).lower():
+        logger.error(f"Upsert not completed for {conversation_id}:{chunk_index}, status={status}")
+        return 0

     return 1

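The upsert now blocks with wait=True, goes through the retry helper, and checks the returned status. A standalone sketch of the same wait-and-verify pattern against an in-memory Qdrant instance (the collection name and vector size are placeholders, not the importer's configuration):

    from qdrant_client import QdrantClient
    from qdrant_client.models import Distance, PointStruct, VectorParams

    client = QdrantClient(":memory:")  # throwaway local instance for illustration
    client.create_collection(
        collection_name="demo",
        vectors_config=VectorParams(size=4, distance=Distance.COSINE),
    )

    result = client.upsert(
        collection_name="demo",
        points=[PointStruct(id=1, vector=[0.1, 0.2, 0.3, 0.4], payload={"conversation_id": "abc"})],
        wait=True,  # block until the write is applied
    )

    status = getattr(result, "status", None)
    if status and "completed" not in str(status).lower():
        raise RuntimeError(f"Upsert not completed: {status}")
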
@@ -333,15 +345,15 @@ def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str, i
             if '```' in item.get('text', ''):
                 metadata['has_code_blocks'] = True
                 # Extract code for AST analysis with bounds checking
-                if len(metadata['ast_elements']) <
+                if len(metadata['ast_elements']) < MAX_AST_ELEMENTS:
                     # Fix: More permissive regex to handle various fence formats
-                    code_blocks = re.findall(r'```[
-                    for code_block in code_blocks[:
-                        if len(metadata['ast_elements']) >=
+                    code_blocks = re.findall(r'```[^`]*?\n(.*?)```', item.get('text', ''), re.DOTALL)
+                    for code_block in code_blocks[:MAX_CODE_BLOCKS]:  # Use defined constant
+                        if len(metadata['ast_elements']) >= MAX_AST_ELEMENTS:
                             break
                         ast_elems = extract_ast_elements(code_block)
-                        for elem in list(ast_elems)[:
-                            if elem not in metadata['ast_elements'] and len(metadata['ast_elements']) <
+                        for elem in list(ast_elems)[:MAX_ELEMENTS_PER_BLOCK]:  # Use defined constant
+                            if elem not in metadata['ast_elements'] and len(metadata['ast_elements']) < MAX_AST_ELEMENTS:
                                 metadata['ast_elements'].append(elem)

         elif item.get('type') == 'tool_use':

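The new fence regex tolerates an optional language tag after the opening backticks. A quick check on sample text (the sample strings are illustrative only):

    import re

    text = "Intro\n```python\nprint('hi')\n```\nand\n```\nx = 1\n```\n"
    blocks = re.findall(r'```[^`]*?\n(.*?)```', text, re.DOTALL)
    print(blocks)  # ["print('hi')\n", 'x = 1\n']
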
@@ -388,17 +400,17 @@ def extract_metadata_single_pass(file_path: str) -> tuple[Dict[str, Any], str, i

     # Extract concepts from collected text
     if all_text:
-        combined_text = ' '.join(all_text[:
+        combined_text = ' '.join(all_text[:MAX_CONCEPT_MESSAGES])  # Limit messages for concept extraction
         metadata['concepts'] = extract_concepts(combined_text)

     # Set total messages
     metadata['total_messages'] = message_count

     # Limit arrays
-    metadata['files_analyzed'] = metadata['files_analyzed'][:
-    metadata['files_edited'] = metadata['files_edited'][:
-    metadata['tools_used'] = metadata['tools_used'][:
-    metadata['ast_elements'] = metadata['ast_elements'][:
+    metadata['files_analyzed'] = metadata['files_analyzed'][:MAX_FILES_ANALYZED]
+    metadata['files_edited'] = metadata['files_edited'][:MAX_FILES_EDITED]
+    metadata['tools_used'] = metadata['tools_used'][:MAX_TOOLS_USED]
+    metadata['ast_elements'] = metadata['ast_elements'][:MAX_AST_ELEMENTS]

     return metadata, first_timestamp or datetime.now().isoformat(), message_count

@@ -406,15 +418,32 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
     """Stream import a single JSONL file without loading it into memory."""
     logger.info(f"Streaming import of {jsonl_file.name}")

+    # Delete existing points for this conversation to prevent stale data
+    conversation_id = jsonl_file.stem
+    try:
+        from qdrant_client.models import Filter, FieldCondition, MatchValue
+        client.delete(
+            collection_name=collection_name,
+            points_selector=Filter(
+                must=[FieldCondition(key="conversation_id", match=MatchValue(value=conversation_id))]
+            ),
+            wait=True
+        )
+        logger.info(f"Deleted existing points for conversation {conversation_id}")
+    except Exception as e:
+        logger.warning(f"Could not delete existing points for {conversation_id}: {e}")
+
     # Extract metadata in first pass (lightweight)
     metadata, created_at, total_messages = extract_metadata_single_pass(str(jsonl_file))

+    # Reset counters for each conversation (critical for correct indexing)
+    current_message_index = 0  # Must be reset before processing each conversation
+
     # Stream messages and process in chunks
     chunk_buffer = []
     chunk_index = 0
     total_chunks = 0
     conversation_id = jsonl_file.stem
-    current_message_index = 0

     try:
         with open(jsonl_file, 'r', encoding='utf-8') as f:

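Deleting a conversation's existing points before re-import keeps edited transcripts from leaving stale chunks behind. The same delete-by-payload-filter pattern as a reusable sketch (the function name is illustrative; the field and argument names follow the diff):

    from qdrant_client import QdrantClient
    from qdrant_client.models import FieldCondition, Filter, MatchValue

    def delete_conversation_points(client: QdrantClient, collection_name: str, conversation_id: str) -> None:
        """Remove every point whose payload 'conversation_id' matches, so a re-import starts clean."""
        client.delete(
            collection_name=collection_name,
            points_selector=Filter(
                must=[FieldCondition(key="conversation_id", match=MatchValue(value=conversation_id))]
            ),
            wait=True,  # do not continue until the delete is applied
        )
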
@@ -434,13 +463,24 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
             if 'message' in data and data['message']:
                 msg = data['message']
                 if msg.get('role') and msg.get('content'):
-                    # Extract content
+                    # Extract content from various message types
                     content = msg['content']
                     if isinstance(content, list):
                         text_parts = []
                         for item in content:
-                            if isinstance(item, dict)
-
+                            if isinstance(item, dict):
+                                item_type = item.get('type', '')
+                                if item_type == 'text':
+                                    text_parts.append(item.get('text', ''))
+                                elif item_type == 'tool_use':
+                                    # Include tool use information
+                                    tool_name = item.get('name', 'unknown')
+                                    tool_input = str(item.get('input', ''))[:500]  # Limit size
+                                    text_parts.append(f"[Tool: {tool_name}] {tool_input}")
+                                elif item_type == 'tool_result':
+                                    # Include tool results
+                                    result_content = str(item.get('content', ''))[:1000]  # Limit size
+                                    text_parts.append(f"[Result] {result_content}")
                             elif isinstance(item, str):
                                 text_parts.append(item)
                         content = '\n'.join(text_parts)

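The expanded branch turns structured content blocks into searchable text instead of dropping them. A self-contained sketch of the same flattening on a made-up message (flatten_content and the sample data are illustrative):

    def flatten_content(content) -> str:
        """Flatten message content (str or list of typed blocks) into plain text."""
        if not isinstance(content, list):
            return str(content)
        parts = []
        for item in content:
            if isinstance(item, dict):
                kind = item.get('type', '')
                if kind == 'text':
                    parts.append(item.get('text', ''))
                elif kind == 'tool_use':
                    parts.append(f"[Tool: {item.get('name', 'unknown')}] {str(item.get('input', ''))[:500]}")
                elif kind == 'tool_result':
                    parts.append(f"[Result] {str(item.get('content', ''))[:1000]}")
            elif isinstance(item, str):
                parts.append(item)
        return '\n'.join(parts)

    sample = [
        {"type": "text", "text": "Reading the file now."},
        {"type": "tool_use", "name": "Read", "input": {"file_path": "README.md"}},
    ]
    print(flatten_content(sample))
    # Reading the file now.
    # [Tool: Read] {'file_path': 'README.md'}
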
@@ -448,8 +488,8 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
                     if content:
                         # Track message index for user/assistant messages
                         if msg['role'] in ['user', 'assistant']:
-                            current_message_index += 1
                             message_idx = current_message_index
+                            current_message_index += 1
                         else:
                             message_idx = 0

@@ -475,6 +515,51 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
                     # Log progress
                     if chunk_index % 10 == 0:
                         logger.info(f"Processed {chunk_index} chunks from {jsonl_file.name}")
+
+            # Handle top-level tool_result/tool_use events (no message wrapper)
+            entry_type = data.get('type')
+            if entry_type in ('tool_result', 'tool_use'):
+                text_parts = []
+                if entry_type == 'tool_use':
+                    tool_name = data.get('name', 'unknown')
+                    tool_input = str(data.get('input', ''))[:500]
+                    text_parts.append(f"[Tool: {tool_name}] {tool_input}")
+                elif entry_type == 'tool_result':
+                    # Common structures: either 'content' (list/str) or 'result'
+                    result_content = data.get('content')
+                    if isinstance(result_content, list):
+                        # flatten to text
+                        flat = []
+                        for itm in result_content:
+                            if isinstance(itm, dict) and itm.get('type') == 'text':
+                                flat.append(itm.get('text', ''))
+                            elif isinstance(itm, str):
+                                flat.append(itm)
+                        result_content = "\n".join(flat)
+                    if not result_content:
+                        result_content = data.get('result', '')  # fallback key used by some tools
+                    text_parts.append(f"[Result] {str(result_content)[:1000]}")
+
+                content = "\n".join([p for p in text_parts if p])
+                if content:
+                    # Track message index for summary format too
+                    message_idx = current_message_index
+                    current_message_index += 1
+
+                    chunk_buffer.append({
+                        'role': entry_type,
+                        'content': content,
+                        'message_index': message_idx
+                    })
+                    if len(chunk_buffer) >= MAX_CHUNK_SIZE:
+                        chunks = process_and_upload_chunk(
+                            chunk_buffer, chunk_index, conversation_id,
+                            created_at, metadata, collection_name, project_path, total_messages
+                        )
+                        total_chunks += chunks
+                        chunk_buffer = []
+                        chunk_index += 1
+                        gc.collect()

         except json.JSONDecodeError:
             logger.debug(f"Skipping invalid JSON at line {line_num}")

@@ -496,14 +581,35 @@ def stream_import_file(jsonl_file: Path, collection_name: str, project_path: Pat
         logger.error(f"Failed to import {jsonl_file}: {e}")
         return 0

+def _locked_open(path, mode):
+    """Open file with exclusive lock for concurrent safety."""
+    f = open(path, mode)
+    try:
+        fcntl.flock(f.fileno(), fcntl.LOCK_EX)
+    except Exception:
+        f.close()
+        raise
+    return f
+
+def _with_retries(fn, attempts=3, base_sleep=0.5):
+    """Execute function with retries and exponential backoff."""
+    for i in range(attempts):
+        try:
+            return fn()
+        except Exception as e:
+            if i == attempts - 1:
+                raise
+            time.sleep(base_sleep * (2 ** i))
+            logger.debug(f"Retrying after error: {e}")
+
 def load_state() -> dict:
-    """Load import state."""
+    """Load import state with file locking."""
     if os.path.exists(STATE_FILE):
         try:
-            with
+            with _locked_open(STATE_FILE, 'r') as f:
                 return json.load(f)
-        except:
-
+        except Exception as e:
+            logger.warning(f"Failed to load state: {e}")
     return {"imported_files": {}}

 def save_state(state: dict):

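A short usage sketch for the two helpers defined above, assuming they are in scope (the path and the flaky call are placeholders; fcntl locking is POSIX-only):

    import json

    def read_state(path: str) -> dict:
        """Read a JSON state file while holding an exclusive flock."""
        with _locked_open(path, 'r') as f:
            return json.load(f)

    def flaky_ping() -> str:
        """Stand-in for a call that may fail transiently, e.g. a Qdrant request."""
        return "pong"

    # Retries up to 3 times, sleeping 0.5s then 1s between failed attempts.
    print(_with_retries(flaky_ping))
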
@@ -513,10 +619,12 @@ def save_state(state: dict):
     if state_dir:
         os.makedirs(state_dir, exist_ok=True)

-    # Use atomic write to prevent corruption
+    # Use atomic write with locking to prevent corruption
     temp_file = f"{STATE_FILE}.tmp"
-    with
+    with _locked_open(temp_file, 'w') as f:
         json.dump(state, f, indent=2)
+        f.flush()
+        os.fsync(f.fileno())

     # Atomic rename (on POSIX systems)
     os.replace(temp_file, STATE_FILE)

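save_state now flushes and fsyncs before the atomic rename, so a crash mid-write cannot leave a truncated state file. The same pattern as a standalone sketch without the lock (atomic_write_json is illustrative, not the package's API):

    import json
    import os
    import tempfile

    def atomic_write_json(path: str, data: dict) -> None:
        """Write JSON to a temp file in the target directory, fsync, then atomically replace."""
        directory = os.path.dirname(path) or "."
        fd, tmp_path = tempfile.mkstemp(dir=directory, suffix=".tmp")
        try:
            with os.fdopen(fd, "w") as f:
                json.dump(data, f, indent=2)
                f.flush()
                os.fsync(f.fileno())
            os.replace(tmp_path, path)  # atomic when source and target share a filesystem
        except Exception:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise
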
@@ -527,9 +635,23 @@ def should_import_file(file_path: Path, state: dict) -> bool:
     if file_str in state.get("imported_files", {}):
         file_info = state["imported_files"][file_str]
         last_modified = file_path.stat().st_mtime
-
-
-
+
+        # Check if file has been modified
+        if file_info.get("last_modified") != last_modified:
+            logger.info(f"File modified, will re-import: {file_path.name}")
+            return True
+
+        # Check for suspiciously low chunk counts (likely failed imports)
+        chunks = file_info.get("chunks", 0)
+        file_size_kb = file_path.stat().st_size / 1024
+
+        # Heuristic: Files > 10KB should have more than 2 chunks
+        if file_size_kb > 10 and chunks <= 2:
+            logger.warning(f"File has suspiciously low chunks ({chunks}) for size {file_size_kb:.1f}KB, will re-import: {file_path.name}")
+            return True
+
+        logger.info(f"Skipping unchanged file: {file_path.name}")
+        return False
     return True

 def update_file_state(file_path: Path, state: dict, chunks: int):

@@ -585,12 +707,37 @@ def main():
         if should_import_file(jsonl_file, state):
             chunks = stream_import_file(jsonl_file, collection_name, project_dir)
             if chunks > 0:
-
-
-
+                # Verify data is actually in Qdrant before marking as imported
+                from qdrant_client.models import Filter, FieldCondition, MatchValue
+                try:
+                    conversation_id = jsonl_file.stem
+                    count_result = _with_retries(lambda: client.count(
+                        collection_name=collection_name,
+                        count_filter=Filter(
+                            must=[FieldCondition(key="conversation_id",
+                                                 match=MatchValue(value=conversation_id))]
+                        ),
+                        exact=True  # Ensure exact count, not approximation
+                    ))
+                    actual_count = count_result.count if hasattr(count_result, 'count') else 0
+
+                    if actual_count > 0:
+                        logger.info(f"Verified {actual_count} points in Qdrant for {conversation_id}")
+                        update_file_state(jsonl_file, state, chunks)
+                        save_state(state)
+                        total_imported += 1
+                    else:
+                        logger.error(f"No points found in Qdrant for {conversation_id} despite {chunks} chunks processed - not marking as imported")
+                except Exception as e:
+                    logger.error(f"Failed to verify Qdrant points for {jsonl_file.name}: {e}")
+                    # Don't mark as imported if we can't verify

                 # Force GC after each file
                 gc.collect()
+            else:
+                # Critical fix: Don't mark files with 0 chunks as imported
+                # This allows retry on next run
+                logger.warning(f"File produced 0 chunks, not marking as imported: {jsonl_file.name}")

     logger.info(f"Import complete: processed {total_imported} files")

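main() now counts the stored points for a conversation (exact=True) before recording the file as imported. The verification step as a reusable sketch (the function name, URL, and collection name are placeholders):

    from qdrant_client import QdrantClient
    from qdrant_client.models import FieldCondition, Filter, MatchValue

    def count_conversation_points(client: QdrantClient, collection_name: str, conversation_id: str) -> int:
        """Return the exact number of stored points for one conversation."""
        result = client.count(
            collection_name=collection_name,
            count_filter=Filter(
                must=[FieldCondition(key="conversation_id", match=MatchValue(value=conversation_id))]
            ),
            exact=True,  # exact count rather than an estimate
        )
        return result.count

    # e.g. count_conversation_points(QdrantClient(url="http://localhost:6333"), "conv_myproject", "abc123")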