claude-self-reflect 2.4.14 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/open-source-maintainer.md +94 -8
- package/Dockerfile.watcher +7 -0
- package/README.md +4 -0
- package/docker-compose.yaml +5 -4
- package/mcp-server/pyproject.toml +1 -1
- package/mcp-server/src/server.py +217 -0
- package/package.json +1 -1
- package/scripts/import-conversations-enhanced.py +672 -0
- package/scripts/import-conversations-unified.py +3 -1
- package/scripts/import-watcher.py +75 -20
package/.claude/agents/open-source-maintainer.md
CHANGED
@@ -175,20 +175,76 @@ safety check -r mcp-server/requirements.txt
 # For Node: npm test
 ```
 
-#### 5. Release
+#### 4.5. Create Professional Release Notes
 ```bash
+# Create release notes file
+VERSION=$(node -p "require('./package.json').version")
+cat > docs/RELEASE_NOTES_v${VERSION}.md << 'EOF'
+# Release Notes - v${VERSION}
+
+## Summary
+Brief description of what this release addresses and why it matters.
+
+## Changes
+
+### Bug Fixes
+- Fixed global npm installation failing due to Docker build context issues (#13)
+- Modified Dockerfile.importer to embed Python dependencies directly
+- Removed dependency on external requirements.txt file during build
+- Ensures compatibility with both local development and global npm installations
+
+### Technical Details
+- Files modified:
+  - `Dockerfile.importer`: Embedded Python dependencies inline
+  - Removed COPY instruction for scripts that are volume-mounted at runtime
+
+### Verification
+- Docker builds tested successfully in isolation
+- Import process verified to skip already imported files
+- Both local and global npm installation paths validated
+
+## Installation
+```bash
+npm install -g claude-self-reflect@${VERSION}
+```
+
+## Contributors
+Thank you to everyone who reported issues and helped test this release:
+- @mattias012 - Reported npm global installation issue
+- @vbp1 - Confirmed Docker setup problems
+
+## Related Issues
+- Resolves #13: Global npm installation Docker build failures
+EOF
+```
+
+#### 5. Version Bump & Release Creation
+```bash
+# Update package.json version BEFORE creating tag
+# Determine version bump type based on changes:
+# - patch: bug fixes, minor updates (2.4.10 -> 2.4.11)
+# - minor: new features, non-breaking changes (2.4.10 -> 2.5.0)
+# - major: breaking changes (2.4.10 -> 3.0.0)
+npm version patch --no-git-tag-version  # Updates package.json and package-lock.json
+
+# Commit version bump
+VERSION=$(node -p "require('./package.json').version")
+git add package.json package-lock.json
+git commit -m "chore: bump version to ${VERSION} for release"
+git push origin main
+
 # Create and push tag
-git tag -a
-git push origin
+git tag -a v${VERSION} -m "Release v${VERSION} - Brief description"
+git push origin v${VERSION}
 
 # Create GitHub release
-gh release create
---title "
---notes-file docs/
+gh release create v${VERSION} \
+  --title "v${VERSION} - Release Title" \
+  --notes-file docs/RELEASE_NOTES_v${VERSION}.md \
   --target main
 
 # Monitor the release workflow
-echo "
+echo "Release created! Monitoring automated publishing..."
 gh run list --workflow "CI/CD Pipeline" --limit 1
 gh run watch
 ```
@@ -207,7 +263,7 @@ echo "⏳ Waiting for automated npm publish..."
 # Monitor the release workflow until npm publish completes
 ```
 
-#### 7. Post-Release Verification
+#### 7. Post-Release Verification & Issue Management
 ```bash
 # Verify GitHub release
 gh release view vX.Y.Z
@@ -217,6 +273,36 @@ npm view claude-self-reflect version
 
 # Check that related PRs are closed
 gh pr list --state closed --limit 10
+
+# Handle related issues professionally
+# For each issue addressed in this release:
+ISSUE_NUMBER=13  # Example
+VERSION=$(node -p "require('./package.json').version")
+
+# Determine if issue should be closed or kept open
+# Close if: bug fixed, feature implemented, question answered
+# Keep open if: partial fix, needs more work, ongoing discussion
+
+# Professional comment template (no emojis, clear references)
+gh issue comment $ISSUE_NUMBER --body "Thank you for reporting this issue. The global npm installation problem has been addressed in release v${VERSION}.
+
+The fix involved modifying the Docker build process to embed dependencies directly:
+- Modified: Dockerfile.importer - Embedded Python dependencies to avoid file path issues
+- Verified: Docker builds work correctly without requiring scripts directory in build context
+- Tested: Import process correctly skips already imported files
+
+You can update to the latest version with:
+\`\`\`bash
+npm install -g claude-self-reflect@${VERSION}
+\`\`\`
+
+Please let us know if you encounter any issues with the new version."
+
+# Close the issue if fully resolved
+gh issue close $ISSUE_NUMBER --comment "Closing as resolved in v${VERSION}. Feel free to reopen if you encounter any related issues."
+
+# Or keep open with status update if partially resolved
+# gh issue comment $ISSUE_NUMBER --body "Partial fix implemented in v${VERSION}. Keeping this issue open to track remaining work on [specific aspect]."
 ```
 
 #### 8. Rollback Procedures
package/Dockerfile.watcher
CHANGED
@@ -20,12 +20,19 @@ RUN pip install --no-cache-dir \
 # Create non-root user
 RUN useradd -m -u 1000 watcher
 
+# Pre-download FastEmbed model to avoid runtime downloads
+RUN mkdir -p /home/watcher/.cache && \
+    FASTEMBED_CACHE_PATH=/home/watcher/.cache/fastembed python -c "from fastembed import TextEmbedding; import os; os.environ['FASTEMBED_CACHE_PATH']='/home/watcher/.cache/fastembed'; TextEmbedding('sentence-transformers/all-MiniLM-L6-v2')" && \
+    chown -R watcher:watcher /home/watcher/.cache
+
 # Create scripts directory and copy required files
 RUN mkdir -p /scripts
 
 # Copy all necessary scripts
 COPY scripts/import-conversations-unified.py /scripts/
 COPY scripts/import-watcher.py /scripts/
+COPY scripts/utils.py /scripts/
+COPY scripts/trigger-import.py /scripts/
 
 RUN chmod +x /scripts/*.py
 
package/README.md
CHANGED
@@ -203,6 +203,10 @@ Recent conversations matter more. Old ones fade. Like your brain, but reliable.
 
 Works perfectly out of the box. [Configure if you're particular](docs/memory-decay.md).
 
+## Theoretical Foundation
+
+Claude Self-Reflect addresses the "reality gap" in AI memory systems - the distance between perfect recall expectations and practical utility. Our approach aligns with the SPAR Framework (Sense, Plan, Act, Reflect) for agentic AI systems. [Learn more about our design philosophy](docs/architecture/SPAR-alignment.md).
+
 ## For the Skeptics
 
 **"Just use grep"** - Sure, enjoy your 10,000 matches for "database"
package/docker-compose.yaml
CHANGED
@@ -22,8 +22,8 @@ services:
       - QDRANT__LOG_LEVEL=INFO
       - QDRANT__SERVICE__HTTP_PORT=6333
     restart: unless-stopped
-    mem_limit: ${QDRANT_MEMORY:-
-    memswap_limit: ${QDRANT_MEMORY:-
+    mem_limit: ${QDRANT_MEMORY:-2g}
+    memswap_limit: ${QDRANT_MEMORY:-2g}
 
   # One-time import service (runs once then exits)
   importer:
@@ -66,6 +66,7 @@ services:
       - ${CLAUDE_LOGS_PATH:-~/.claude/projects}:/logs:ro
      - ${CONFIG_PATH:-~/.claude-self-reflect/config}:/config
       - ./scripts:/scripts:ro
+      - /tmp:/tmp
     environment:
       - QDRANT_URL=http://qdrant:6333
       - STATE_FILE=/config/imported-files.json
@@ -78,8 +79,8 @@ services:
       - PYTHONUNBUFFERED=1
     restart: unless-stopped
     profiles: ["watch"]
-    mem_limit:
-    memswap_limit:
+    mem_limit: 1g
+    memswap_limit: 1g
 
   # MCP server for Claude integration
   mcp-server:
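The Qdrant container's memory cap is now configurable through the QDRANT_MEMORY variable (defaulting to 2g for both mem_limit and memswap_limit). A minimal usage sketch, assuming the qdrant service defined in this compose file and a Docker Compose CLI that expands the variable:

```bash
# Raise the Qdrant memory/swap cap above the 2g default, then (re)create the service
QDRANT_MEMORY=4g docker compose up -d qdrant
```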
package/mcp-server/src/server.py
CHANGED
@@ -887,6 +887,223 @@ async def get_more_results(
     return response
 
 
+@mcp.tool()
+async def search_by_file(
+    ctx: Context,
+    file_path: str = Field(description="The file path to search for in conversations"),
+    limit: int = Field(default=10, description="Maximum number of results to return"),
+    project: Optional[str] = Field(default=None, description="Search specific project only. Use 'all' to search across all projects.")
+) -> str:
+    """Search for conversations that analyzed a specific file."""
+    global qdrant_client
+
+    # Normalize file path
+    normalized_path = file_path.replace("\\", "/").replace("/Users/", "~/")
+
+    # Determine which collections to search
+    # If no project specified, search all collections
+    collections = await get_all_collections() if not project else []
+
+    if project and project != 'all':
+        # Filter collections for specific project
+        project_hash = hashlib.md5(project.encode()).hexdigest()[:8]
+        collection_prefix = f"conv_{project_hash}_"
+        collections = [c for c in await get_all_collections() if c.startswith(collection_prefix)]
+    elif project == 'all':
+        collections = await get_all_collections()
+
+    if not collections:
+        return "<search_by_file>\n<error>No collections found to search</error>\n</search_by_file>"
+
+    # Prepare results
+    all_results = []
+
+    for collection_name in collections:
+        try:
+            # Use scroll to get all points and filter manually
+            # Qdrant's array filtering can be tricky, so we'll filter in code
+            scroll_result = await qdrant_client.scroll(
+                collection_name=collection_name,
+                limit=1000,  # Get a batch
+                with_payload=True
+            )
+
+            # Filter results that contain the file
+            for point in scroll_result[0]:
+                payload = point.payload
+                files_analyzed = payload.get('files_analyzed', [])
+                files_edited = payload.get('files_edited', [])
+
+                if normalized_path in files_analyzed or normalized_path in files_edited:
+                    all_results.append({
+                        'score': 1.0,  # File match is always 1.0
+                        'payload': payload,
+                        'collection': collection_name
+                    })
+
+        except Exception as e:
+            continue
+
+    # Sort by timestamp (newest first)
+    all_results.sort(key=lambda x: x['payload'].get('timestamp', ''), reverse=True)
+
+    # Format results
+    if not all_results:
+        return f"""<search_by_file>
+<query>{file_path}</query>
+<normalized_path>{normalized_path}</normalized_path>
+<message>No conversations found that analyzed this file</message>
+</search_by_file>"""
+
+    results_text = []
+    for i, result in enumerate(all_results[:limit]):
+        payload = result['payload']
+        timestamp = payload.get('timestamp', 'Unknown')
+        conversation_id = payload.get('conversation_id', 'Unknown')
+        project = payload.get('project', 'Unknown')
+        text_preview = payload.get('text', '')[:200] + '...' if len(payload.get('text', '')) > 200 else payload.get('text', '')
+
+        # Check if file was edited or just read
+        action = "edited" if normalized_path in payload.get('files_edited', []) else "analyzed"
+
+        # Get related tools used
+        tool_summary = payload.get('tool_summary', {})
+        tools_used = ', '.join(f"{tool}({count})" for tool, count in tool_summary.items())
+
+        results_text.append(f"""<result rank="{i+1}">
+<conversation_id>{conversation_id}</conversation_id>
+<project>{project}</project>
+<timestamp>{timestamp}</timestamp>
+<action>{action}</action>
+<tools_used>{tools_used}</tools_used>
+<preview>{text_preview}</preview>
+</result>""")
+
+    return f"""<search_by_file>
+<query>{file_path}</query>
+<normalized_path>{normalized_path}</normalized_path>
+<count>{len(all_results)}</count>
+<results>
+{''.join(results_text)}
+</results>
+</search_by_file>"""
+
+
+@mcp.tool()
+async def search_by_concept(
+    ctx: Context,
+    concept: str = Field(description="The concept to search for (e.g., 'security', 'docker', 'testing')"),
+    include_files: bool = Field(default=True, description="Include file information in results"),
+    limit: int = Field(default=10, description="Maximum number of results to return"),
+    project: Optional[str] = Field(default=None, description="Search specific project only. Use 'all' to search across all projects.")
+) -> str:
+    """Search for conversations about a specific development concept."""
+    global qdrant_client
+
+    # Generate embedding for the concept
+    embedding = await generate_embedding(concept)
+
+    # Determine which collections to search
+    # If no project specified, search all collections
+    collections = await get_all_collections() if not project else []
+
+    if project and project != 'all':
+        # Filter collections for specific project
+        project_hash = hashlib.md5(project.encode()).hexdigest()[:8]
+        collection_prefix = f"conv_{project_hash}_"
+        collections = [c for c in await get_all_collections() if c.startswith(collection_prefix)]
+    elif project == 'all':
+        collections = await get_all_collections()
+
+    if not collections:
+        return "<search_by_concept>\n<error>No collections found to search</error>\n</search_by_concept>"
+
+    # Search all collections
+    all_results = []
+
+    for collection_name in collections:
+        try:
+            # Hybrid search: semantic + concept filter
+            results = await qdrant_client.search(
+                collection_name=collection_name,
+                query_vector=embedding,
+                query_filter=models.Filter(
+                    should=[
+                        models.FieldCondition(
+                            key="concepts",
+                            match=models.MatchAny(any=[concept.lower()])
+                        )
+                    ]
+                ),
+                limit=limit * 2,  # Get more results for better filtering
+                with_payload=True
+            )
+
+            for point in results:
+                payload = point.payload
+                # Boost score if concept is in the concepts list
+                score_boost = 0.2 if concept.lower() in payload.get('concepts', []) else 0.0
+                all_results.append({
+                    'score': float(point.score) + score_boost,
+                    'payload': payload,
+                    'collection': collection_name
+                })
+
+        except Exception as e:
+            continue
+
+    # Sort by score and limit
+    all_results.sort(key=lambda x: x['score'], reverse=True)
+    all_results = all_results[:limit]
+
+    # Format results
+    if not all_results:
+        return f"""<search_by_concept>
+<concept>{concept}</concept>
+<message>No conversations found about this concept</message>
+</search_by_concept>"""
+
+    results_text = []
+    for i, result in enumerate(all_results):
+        payload = result['payload']
+        score = result['score']
+        timestamp = payload.get('timestamp', 'Unknown')
+        conversation_id = payload.get('conversation_id', 'Unknown')
+        project = payload.get('project', 'Unknown')
+        concepts = payload.get('concepts', [])
+
+        # Get text preview
+        text_preview = payload.get('text', '')[:200] + '...' if len(payload.get('text', '')) > 200 else payload.get('text', '')
+
+        # File information
+        files_info = ""
+        if include_files:
+            files_analyzed = payload.get('files_analyzed', [])[:5]
+            if files_analyzed:
+                files_info = f"\n<files_analyzed>{', '.join(files_analyzed)}</files_analyzed>"
+
+        # Related concepts
+        related_concepts = [c for c in concepts if c != concept.lower()][:5]
+
+        results_text.append(f"""<result rank="{i+1}">
+<score>{score:.3f}</score>
+<conversation_id>{conversation_id}</conversation_id>
+<project>{project}</project>
+<timestamp>{timestamp}</timestamp>
+<concepts>{', '.join(concepts)}</concepts>
+<related_concepts>{', '.join(related_concepts)}</related_concepts>{files_info}
+<preview>{text_preview}</preview>
+</result>""")
+
+    return f"""<search_by_concept>
+<concept>{concept}</concept>
+<count>{len(all_results)}</count>
+<results>
+{''.join(results_text)}
+</results>
+</search_by_concept>"""
+
+
 # Debug output
 print(f"[DEBUG] FastMCP server created with name: {mcp.name}")
 
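Both new tools resolve a project name to Qdrant collections by prefix, conv_ followed by the first 8 hex characters of the MD5 of the project name. A small sketch for inspecting which collections a project maps to, assuming md5sum and curl are available, Qdrant is on its default port, and "my-project" stands in for a real project name:

```bash
# Hypothetical helper: compute the collection prefix the new search tools derive
# from a project name (mirrors hashlib.md5(project.encode()).hexdigest()[:8])
PROJECT="my-project"
PREFIX="conv_$(printf '%s' "$PROJECT" | md5sum | cut -c1-8)_"
echo "$PREFIX"

# List matching collections on a local Qdrant instance
curl -s http://localhost:6333/collections | grep -o "${PREFIX}[a-z]*"
```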
package/scripts/import-conversations-enhanced.py
CHANGED
@@ -0,0 +1,672 @@
+#!/usr/bin/env python3
+"""
+Enhanced import script that extracts tool usage metadata from conversations.
+Supports both local and Voyage AI embeddings with tool tracking.
+"""
+
+import os
+import sys
+import json
+import glob
+import hashlib
+import gc
+import re
+import time
+from datetime import datetime, timedelta
+from typing import List, Dict, Any, Set, Tuple
+import logging
+from pathlib import Path
+
+from qdrant_client import QdrantClient
+from qdrant_client.models import (
+    VectorParams, Distance, PointStruct,
+    Filter, FieldCondition, MatchValue
+)
+
+from tenacity import (
+    retry,
+    stop_after_attempt,
+    wait_random_exponential,
+)
+
+# Configuration
+QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
+LOGS_DIR = os.getenv("LOGS_DIR", "/logs")
+STATE_FILE = os.getenv("STATE_FILE", "./config/imported-files-enhanced.json")
+BATCH_SIZE = int(os.getenv("BATCH_SIZE", "10"))
+PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "false").lower() == "true"
+VOYAGE_API_KEY = os.getenv("VOYAGE_KEY")
+DRY_RUN = os.getenv("DRY_RUN", "false").lower() == "true"
+
+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# Import timing stats
+timing_stats = {
+    "extract": [],
+    "chunk": [],
+    "embed": [],
+    "store": [],
+    "total": []
+}
+
+def normalize_path(path: str) -> str:
+    """Normalize file paths for consistency across platforms."""
+    if not path:
+        return ""
+
+    # Remove common prefixes
+    path = path.replace("/Users/", "~/")
+    path = path.replace("\\Users\\", "~\\")
+
+    # Convert to forward slashes
+    path = path.replace("\\", "/")
+
+    # Remove duplicate slashes
+    path = re.sub(r'/+', '/', path)
+
+    return path
+
+def extract_concepts(text: str, tool_usage: Dict[str, Any]) -> Set[str]:
+    """Extract high-level concepts from conversation and tool usage."""
+    concepts = set()
+
+    # Common development concepts with patterns
+    concept_patterns = {
+        'security': r'(security|vulnerability|CVE|injection|sanitize|escape|auth|token|JWT)',
+        'performance': r'(performance|optimization|speed|memory|efficient|benchmark|latency)',
+        'testing': r'(test|pytest|unittest|coverage|TDD|spec|assert)',
+        'docker': r'(docker|container|compose|dockerfile|kubernetes|k8s)',
+        'api': r'(API|REST|GraphQL|endpoint|webhook|http|request)',
+        'database': r'(database|SQL|query|migration|schema|postgres|mysql|mongodb)',
+        'authentication': r'(auth|login|token|JWT|session|oauth|permission)',
+        'debugging': r'(debug|error|exception|traceback|log|stack|trace)',
+        'refactoring': r'(refactor|cleanup|improve|restructure|optimize|technical debt)',
+        'deployment': r'(deploy|CI/CD|release|production|staging|rollout)',
+        'git': r'(git|commit|branch|merge|pull request|PR|rebase)',
+        'architecture': r'(architecture|design|pattern|structure|component|module)',
+        'mcp': r'(MCP|claude-self-reflect|tool|agent|claude code)',
+        'embeddings': r'(embedding|vector|semantic|similarity|fastembed|voyage)',
+        'search': r'(search|query|find|filter|match|relevance)'
+    }
+
+    # Check text content
+    combined_text = text.lower()
+    for concept, pattern in concept_patterns.items():
+        if re.search(pattern, combined_text, re.IGNORECASE):
+            concepts.add(concept)
+
+    # Check tool usage patterns
+    tool_text = json.dumps(tool_usage).lower()
+    for concept, pattern in concept_patterns.items():
+        if re.search(pattern, tool_text, re.IGNORECASE):
+            concepts.add(concept)
+
+    # Add concepts based on specific tool usage
+    if tool_usage.get('grep_searches'):
+        concepts.add('search')
+    if tool_usage.get('files_edited') or tool_usage.get('files_created'):
+        concepts.add('development')
+    if any('test' in str(f).lower() for f in tool_usage.get('files_read', [])):
+        concepts.add('testing')
+    if any('docker' in str(cmd).lower() for cmd in tool_usage.get('bash_commands', [])):
+        concepts.add('docker')
+
+    return concepts
+
+def extract_tool_usage_from_jsonl(jsonl_path: str) -> Dict[str, Any]:
+    """Extract all tool usage from a conversation."""
+    tool_usage = {
+        "files_read": [],
+        "files_edited": [],
+        "files_created": [],
+        "grep_searches": [],
+        "bash_commands": [],
+        "glob_patterns": [],
+        "task_calls": [],
+        "mcp_calls": [],
+        "tools_summary": {},
+        "concepts": set(),
+        "timing": {},
+        "errors": [],
+        "tool_results": {}
+    }
+
+    start_time = time.time()
+
+    with open(jsonl_path, 'r', encoding='utf-8') as f:
+        for line_num, line in enumerate(f, 1):
+            line = line.strip()
+            if not line:
+                continue
+
+            try:
+                data = json.loads(line)
+
+                # Skip API error messages
+                if data.get('isApiErrorMessage'):
+                    continue
+
+                # Process message content
+                if 'message' in data and 'content' in data['message']:
+                    content = data['message']['content']
+
+                    # Handle content array (where tool_use lives)
+                    if isinstance(content, list):
+                        for item in content:
+                            if isinstance(item, dict) and item.get('type') == 'tool_use':
+                                extract_single_tool_use(item, tool_usage)
+
+            except json.JSONDecodeError as e:
+                logger.debug(f"Skipping invalid JSON at line {line_num}: {e}")
+            except Exception as e:
+                logger.error(f"Error processing line {line_num}: {e}")
+                tool_usage["errors"].append({"line": line_num, "error": str(e)})
+
+    # Calculate timing
+    tool_usage["timing"]["extract_ms"] = int((time.time() - start_time) * 1000)
+
+    # Convert sets to lists for JSON serialization
+    tool_usage["concepts"] = list(tool_usage["concepts"])
+
+    return tool_usage
+
+def extract_single_tool_use(tool_data: Dict[str, Any], usage_dict: Dict[str, Any]) -> None:
+    """Parse individual tool usage with enhanced metadata extraction."""
+    tool_name = tool_data.get('name')
+    inputs = tool_data.get('input', {})
+    tool_id = tool_data.get('id')
+
+    # Track tool frequency
+    usage_dict['tools_summary'][tool_name] = usage_dict['tools_summary'].get(tool_name, 0) + 1
+
+    # Extract based on tool type
+    if tool_name == 'Read':
+        path = inputs.get('file_path')
+        if path:
+            usage_dict['files_read'].append({
+                'path': normalize_path(path),
+                'offset': inputs.get('offset', 0),
+                'limit': inputs.get('limit', -1),
+                'tool_id': tool_id
+            })
+
+    elif tool_name == 'Grep':
+        pattern = inputs.get('pattern')
+        if pattern:
+            usage_dict['grep_searches'].append({
+                'pattern': pattern[:100],  # Limit pattern length
+                'path': normalize_path(inputs.get('path', '.')),
+                'glob': inputs.get('glob'),
+                'output_mode': inputs.get('output_mode', 'files_with_matches'),
+                'case_insensitive': inputs.get('-i', False)
+            })
+            # Add search concept
+            usage_dict['concepts'].add('search')
+
+    elif tool_name == 'Edit' or tool_name == 'MultiEdit':
+        path = inputs.get('file_path')
+        if path:
+            usage_dict['files_edited'].append({
+                'path': normalize_path(path),
+                'operation': tool_name.lower()
+            })
+
+    elif tool_name == 'Write':
+        path = inputs.get('file_path')
+        if path:
+            usage_dict['files_created'].append(normalize_path(path))
+
+    elif tool_name == 'Bash':
+        cmd = inputs.get('command', '')
+        if cmd:
+            # Extract command name
+            cmd_parts = cmd.split()
+            cmd_name = cmd_parts[0] if cmd_parts else 'unknown'
+
+            usage_dict['bash_commands'].append({
+                'command': cmd_name,
+                'description': inputs.get('description', '')[:100]
+            })
+
+            # Add concepts based on commands
+            if 'docker' in cmd.lower():
+                usage_dict['concepts'].add('docker')
+            if 'git' in cmd.lower():
+                usage_dict['concepts'].add('git')
+            if 'test' in cmd.lower() or 'pytest' in cmd.lower():
+                usage_dict['concepts'].add('testing')
+
+    elif tool_name == 'Glob':
+        pattern = inputs.get('pattern')
+        if pattern:
+            usage_dict['glob_patterns'].append({
+                'pattern': pattern,
+                'path': normalize_path(inputs.get('path', '.'))
+            })
+
+    elif tool_name == 'Task':
+        usage_dict['task_calls'].append({
+            'description': inputs.get('description', '')[:100],
+            'subagent_type': inputs.get('subagent_type')
+        })
+
+    # Handle MCP tools
+    elif tool_name and tool_name.startswith('mcp__'):
+        usage_dict['mcp_calls'].append({
+            'tool': tool_name,
+            'params': list(inputs.keys()) if inputs else []
+        })
+        usage_dict['concepts'].add('mcp')
+
+def create_enhanced_chunk(messages: List[Dict], chunk_index: int, tool_usage: Dict[str, Any],
+                          conversation_metadata: Dict[str, Any]) -> Dict[str, Any]:
+    """Create chunk with tool usage metadata."""
+    # Extract text from messages
+    chunk_text = "\n\n".join([
+        f"{msg['role'].upper()}: {msg['content']}"
+        for msg in messages
+    ])
+
+    # Extract concepts from chunk text and tool usage
+    concepts = extract_concepts(chunk_text, tool_usage)
+
+    # Deduplicate and clean file paths
+    all_file_items = tool_usage.get('files_read', []) + tool_usage.get('files_edited', [])
+    files_analyzed = list(set([
+        item['path'] if isinstance(item, dict) else item
+        for item in all_file_items
+        if (isinstance(item, dict) and item.get('path')) or isinstance(item, str)
+    ]))[:20]  # Limit to 20 files
+
+    files_edited = list(set([
+        item['path'] if isinstance(item, dict) else item
+        for item in tool_usage.get('files_edited', [])
+        if (isinstance(item, dict) and item.get('path')) or isinstance(item, str)
+    ]))[:10]  # Limit to 10 files
+
+    # Build enhanced chunk
+    chunk = {
+        "text": chunk_text,
+        "conversation_id": conversation_metadata['id'],
+        "chunk_index": chunk_index,
+        "timestamp": conversation_metadata['timestamp'],
+        "project": conversation_metadata['project'],
+        "start_role": messages[0]['role'] if messages else 'unknown',
+
+        # Tool usage metadata
+        "files_analyzed": files_analyzed,
+        "files_edited": files_edited,
+        "search_patterns": [s['pattern'] for s in tool_usage.get('grep_searches', [])][:10],
+        "concepts": list(concepts)[:15],
+        "tool_summary": dict(list(tool_usage.get('tools_summary', {}).items())[:10]),
+        "analysis_only": len(tool_usage.get('files_edited', [])) == 0 and len(tool_usage.get('files_created', [])) == 0,
+
+        # Additional context
+        "commands_used": list(set([c['command'] for c in tool_usage.get('bash_commands', [])]))[:10],
+        "has_security_check": 'security' in concepts,
+        "has_performance_check": 'performance' in concepts,
+        "mcp_tools_used": list(set([m['tool'].split('__')[1] if '__' in m['tool'] else m['tool']
+                                    for m in tool_usage.get('mcp_calls', [])]))[:5]
+    }
+
+    return chunk
+
+# Import state management functions (same as original)
+def load_state():
+    """Load the import state from file."""
+    if os.path.exists(STATE_FILE):
+        try:
+            with open(STATE_FILE, 'r') as f:
+                state = json.load(f)
+                if "imported_files" not in state:
+                    state["imported_files"] = {}
+                return state
+        except Exception as e:
+            logger.warning(f"Failed to load state file: {e}")
+    return {"imported_files": {}}
+
+def save_state(state):
+    """Save the import state to file."""
+    try:
+        os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
+        temp_file = STATE_FILE + ".tmp"
+        with open(temp_file, 'w') as f:
+            json.dump(state, f, indent=2)
+        os.replace(temp_file, STATE_FILE)
+        logger.debug(f"Saved state with {len(state['imported_files'])} files")
+    except Exception as e:
+        logger.error(f"Failed to save state file: {e}")
+
+def should_import_file(file_path, state):
+    """Check if a file should be imported based on modification time."""
+    str_path = str(file_path)
+    file_mtime = os.path.getmtime(file_path)
+
+    if str_path in state["imported_files"]:
+        last_imported = state["imported_files"][str_path].get("last_imported", 0)
+        last_modified = state["imported_files"][str_path].get("last_modified", 0)
+
+        if file_mtime <= last_modified and last_imported > 0:
+            logger.info(f"Skipping unchanged file: {file_path.name}")
+            return False
+
+    return True
+
+def update_file_state(file_path, state, chunks_imported, tool_stats=None):
+    """Update the state for an imported file with tool usage stats."""
+    str_path = str(file_path)
+    state["imported_files"][str_path] = {
+        "last_modified": os.path.getmtime(file_path),
+        "last_imported": datetime.now().timestamp(),
+        "chunks_imported": chunks_imported,
+        "tool_stats": tool_stats or {}
+    }
+
+# Initialize embedding provider
+embedding_provider = None
+embedding_dimension = None
+collection_suffix = None
+
+if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
+    logger.info("Using local FastEmbed embeddings")
+    from fastembed import TextEmbedding
+    embedding_provider = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    embedding_dimension = 384
+    collection_suffix = "_local"
+else:
+    logger.info("Using Voyage AI embeddings")
+    import voyageai
+    vo = voyageai.Client(api_key=VOYAGE_API_KEY)
+    embedding_provider = vo
+    embedding_dimension = 1024
+    collection_suffix = "_voyage"
+
+# Initialize Qdrant client
+client = QdrantClient(url=QDRANT_URL)
+
+def chunk_conversation(messages: List[Dict], chunk_size: int = 10) -> List[Dict]:
+    """Split conversation into chunks of messages."""
+    chunks = []
+    for i in range(0, len(messages), chunk_size):
+        chunk_messages = messages[i:i + chunk_size]
+        chunks.append({
+            "messages": chunk_messages,
+            "chunk_index": i // chunk_size
+        })
+    return chunks
+
+@retry(stop=stop_after_attempt(3), wait=wait_random_exponential(min=1, max=20))
+def generate_embeddings(texts: List[str]) -> List[List[float]]:
+    """Generate embeddings for texts with retry logic."""
+    if PREFER_LOCAL_EMBEDDINGS or not VOYAGE_API_KEY:
+        embeddings = list(embedding_provider.embed(texts))
+        return [emb.tolist() if hasattr(emb, 'tolist') else emb for emb in embeddings]
+    else:
+        result = embedding_provider.embed(texts, model="voyage-3", input_type="document")
+        return result.embeddings
+
+def import_project(project_path: Path, state: Dict) -> int:
+    """Import conversations from a single project with tool usage extraction."""
+    total_chunks = 0
+    jsonl_files = list(project_path.glob("*.jsonl"))
+
+    if not jsonl_files:
+        return 0
+
+    # Create or verify collection
+    collection_name = f"conv_{hashlib.md5(project_path.name.encode()).hexdigest()[:8]}{collection_suffix}"
+
+    try:
+        collections = [c.name for c in client.get_collections().collections]
+        if collection_name not in collections:
+            client.create_collection(
+                collection_name=collection_name,
+                vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
+            )
+            logger.info(f"Created collection: {collection_name}")
+    except Exception as e:
+        logger.error(f"Failed to create/verify collection {collection_name}: {e}")
+        return 0
+
+    for jsonl_file in jsonl_files:
+        if not should_import_file(jsonl_file, state):
+            continue
+
+        logger.info(f"Processing file: {jsonl_file.name}")
+
+        try:
+            file_start_time = time.time()
+
+            # Extract tool usage
+            extract_start = time.time()
+            tool_usage = extract_tool_usage_from_jsonl(str(jsonl_file))
+            extract_time = time.time() - extract_start
+            timing_stats["extract"].append(extract_time)
+
+            # Read and process messages (original logic)
+            messages = []
+            created_at = None
+
+            with open(jsonl_file, 'r', encoding='utf-8') as f:
+                for line_num, line in enumerate(f, 1):
+                    line = line.strip()
+                    if not line:
+                        continue
+
+                    try:
+                        data = json.loads(line)
+
+                        if created_at is None and 'timestamp' in data:
+                            created_at = data.get('timestamp')
+
+                        if data.get('type') == 'summary':
+                            continue
+
+                        if 'message' in data and data['message']:
+                            msg = data['message']
+                            if msg.get('role') and msg.get('content'):
+                                content = msg['content']
+                                if isinstance(content, list):
+                                    text_parts = []
+                                    for item in content:
+                                        if isinstance(item, dict) and item.get('type') == 'text':
+                                            text_parts.append(item.get('text', ''))
+                                        elif isinstance(item, str):
+                                            text_parts.append(item)
+                                    content = '\n'.join(text_parts)
+
+                                if content:
+                                    messages.append({
+                                        'role': msg['role'],
+                                        'content': content
+                                    })
+                    except Exception as e:
+                        logger.error(f"Error processing line {line_num}: {e}")
+
+            if not messages:
+                continue
+
+            # Prepare metadata
+            if created_at is None:
+                created_at = datetime.now().isoformat()
+            conversation_id = jsonl_file.stem
+
+            conversation_metadata = {
+                'id': conversation_id,
+                'timestamp': created_at,
+                'project': project_path.name
+            }
+
+            # Chunk the conversation
+            chunk_start = time.time()
+            chunks_data = chunk_conversation(messages)
+            enhanced_chunks = []
+
+            for chunk_data in chunks_data:
+                enhanced_chunk = create_enhanced_chunk(
+                    chunk_data["messages"],
+                    chunk_data["chunk_index"],
+                    tool_usage,
+                    conversation_metadata
+                )
+                enhanced_chunks.append(enhanced_chunk)
+
+            chunk_time = time.time() - chunk_start
+            timing_stats["chunk"].append(chunk_time)
+
+            if not enhanced_chunks:
+                continue
+
+            # Process in batches
+            for batch_start in range(0, len(enhanced_chunks), BATCH_SIZE):
+                batch = enhanced_chunks[batch_start:batch_start + BATCH_SIZE]
+                texts = [chunk["text"] for chunk in batch]
+
+                # Generate embeddings
+                embed_start = time.time()
+                embeddings = generate_embeddings(texts)
+                embed_time = time.time() - embed_start
+                timing_stats["embed"].append(embed_time)
+
+                # Create points
+                points = []
+                for chunk, embedding in zip(batch, embeddings):
+                    point_id = hashlib.md5(
+                        f"{conversation_id}_{chunk['chunk_index']}".encode()
+                    ).hexdigest()[:16]
+
+                    points.append(PointStruct(
+                        id=int(point_id, 16) % (2**63),
+                        vector=embedding,
+                        payload=chunk
+                    ))
+
+                # Upload to Qdrant (unless dry run)
+                if not DRY_RUN:
+                    store_start = time.time()
+                    client.upsert(
+                        collection_name=collection_name,
+                        points=points
+                    )
+                    store_time = time.time() - store_start
+                    timing_stats["store"].append(store_time)
+                else:
+                    logger.info(f"[DRY RUN] Would upload {len(points)} points to {collection_name}")
+
+                total_chunks += len(points)
+
+            file_chunks = len(enhanced_chunks)
+            total_time = time.time() - file_start_time
+            timing_stats["total"].append(total_time)
+
+            logger.info(f"Imported {file_chunks} chunks from {jsonl_file.name} "
+                        f"(extract: {extract_time:.2f}s, chunk: {chunk_time:.2f}s, total: {total_time:.2f}s)")
+
+            # Update state with tool stats
+            tool_stats = {
+                "tools_used": list(tool_usage['tools_summary'].keys()),
+                "files_analyzed": len(enhanced_chunks[0].get('files_analyzed', [])) if enhanced_chunks else 0,
+                "concepts": list(tool_usage.get('concepts', []))[:10]
+            }
+            update_file_state(jsonl_file, state, file_chunks, tool_stats)
+
+            # Save state after each file
+            if not DRY_RUN:
+                save_state(state)
+
+            gc.collect()
+
+        except Exception as e:
+            logger.error(f"Failed to import {jsonl_file}: {e}")
+            import traceback
+            logger.error(traceback.format_exc())
+
+    return total_chunks
+
+def main():
+    """Main import function with enhanced features."""
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Import conversations with tool usage extraction')
+    parser.add_argument('--days', type=int, help='Import only files from last N days')
+    parser.add_argument('--limit', type=int, help='Limit number of files to import')
+    parser.add_argument('--dry-run', action='store_true', help='Run without actually importing')
+    parser.add_argument('--project', type=str, help='Import only specific project')
+
+    args = parser.parse_args()
+
+    if args.dry_run:
+        global DRY_RUN
+        DRY_RUN = True
+        logger.info("Running in DRY RUN mode - no data will be imported")
+
+    logs_path = Path(LOGS_DIR)
+
+    # Handle local development vs Docker paths
+    if not logs_path.exists():
+        # Try local development path
+        home_logs = Path.home() / '.claude' / 'projects'
+        if home_logs.exists():
+            logs_path = home_logs
+            logger.info(f"Using local logs directory: {logs_path}")
+        else:
+            logger.error(f"Logs directory not found: {LOGS_DIR}")
+            return
+
+    # Load existing state
+    state = load_state()
+    logger.info(f"Loaded state with {len(state['imported_files'])} previously imported files")
+
+    # Find project directories
+    if args.project:
+        project_dirs = [d for d in logs_path.iterdir() if d.is_dir() and args.project in d.name]
+    else:
+        project_dirs = [d for d in logs_path.iterdir() if d.is_dir()]
+
+    if not project_dirs:
+        logger.warning("No project directories found")
+        return
+
+    # Filter by date if specified
+    if args.days:
+        cutoff_date = datetime.now() - timedelta(days=args.days)
+        filtered_dirs = []
+        for project_dir in project_dirs:
+            jsonl_files = list(project_dir.glob("*.jsonl"))
+            recent_files = [f for f in jsonl_files if datetime.fromtimestamp(f.stat().st_mtime) > cutoff_date]
+            if recent_files:
+                filtered_dirs.append(project_dir)
+        project_dirs = filtered_dirs
+        logger.info(f"Filtered to {len(project_dirs)} projects with files from last {args.days} days")
+
+    # Apply limit if specified
+    if args.limit:
+        project_dirs = project_dirs[:args.limit]
+
+    logger.info(f"Found {len(project_dirs)} projects to import")
+
+    # Import each project
+    total_imported = 0
+    for project_dir in project_dirs:
+        logger.info(f"Importing project: {project_dir.name}")
+        chunks = import_project(project_dir, state)
+        total_imported += chunks
+
+    # Print timing statistics
+    logger.info("\n=== Import Performance Summary ===")
+    logger.info(f"Total chunks imported: {total_imported}")
+
+    if timing_stats["total"]:
+        logger.info(f"\nTiming averages:")
+        logger.info(f"  Extract: {sum(timing_stats['extract'])/len(timing_stats['extract']):.2f}s")
+        logger.info(f"  Chunk: {sum(timing_stats['chunk'])/len(timing_stats['chunk']):.2f}s")
+        if timing_stats['embed']:
+            logger.info(f"  Embed: {sum(timing_stats['embed'])/len(timing_stats['embed']):.2f}s")
+        if timing_stats['store']:
+            logger.info(f"  Store: {sum(timing_stats['store'])/len(timing_stats['store']):.2f}s")
+        logger.info(f"  Total: {sum(timing_stats['total'])/len(timing_stats['total']):.2f}s per file")
+
+if __name__ == "__main__":
+    main()
package/scripts/import-conversations-unified.py
CHANGED
@@ -33,7 +33,9 @@ from tenacity import (
 # Configuration
 QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
 LOGS_DIR = os.getenv("LOGS_DIR", "/logs")
-
+# Default to project config directory for state file
+default_state_file = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "config", "imported-files.json")
+STATE_FILE = os.getenv("STATE_FILE", default_state_file)
 BATCH_SIZE = int(os.getenv("BATCH_SIZE", "10"))  # Reduced from 100 to prevent OOM
 PREFER_LOCAL_EMBEDDINGS = os.getenv("PREFER_LOCAL_EMBEDDINGS", "false").lower() == "true"
 VOYAGE_API_KEY = os.getenv("VOYAGE_KEY")
package/scripts/import-watcher.py
CHANGED
@@ -1,33 +1,88 @@
 #!/usr/bin/env python3
-"""
+"""Enhanced watcher that runs import periodically and supports manual triggers."""
 
 import time
 import subprocess
 import os
 import sys
 from datetime import datetime
+from pathlib import Path
 
 WATCH_INTERVAL = int(os.getenv('WATCH_INTERVAL', '60'))
+SIGNAL_FILE = Path("/tmp/claude-self-reflect-import-current")
+CHECK_INTERVAL = 1  # Check for signal file every second
 
-print(f"[Watcher] Starting import watcher with {WATCH_INTERVAL}s interval", flush=True)
+print(f"[Watcher] Starting enhanced import watcher with {WATCH_INTERVAL}s interval", flush=True)
+print(f"[Watcher] Monitoring signal file: {SIGNAL_FILE}", flush=True)
+
+last_import = 0
 
 while True:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    current_time = time.time()
+
+    # Check for manual trigger signal
+    if SIGNAL_FILE.exists():
+        print(f"[Watcher] Signal detected! Running immediate import...", flush=True)
+        try:
+            # Read conversation ID if provided
+            conversation_id = None
+            try:
+                conversation_id = SIGNAL_FILE.read_text().strip()
+            except:
+                pass
+
+            # Remove signal file to prevent re-triggering
+            SIGNAL_FILE.unlink()
+
+            # Run import with special flag for current conversation only
+            cmd = [sys.executable, "/scripts/import-conversations-unified.py"]
+            if conversation_id:
+                cmd.extend(["--conversation-id", conversation_id])
+            else:
+                # Import only today's conversations for manual trigger
+                cmd.extend(["--days", "1"])
+
+            # Write progress indicator
+            progress_file = Path("/tmp/claude-self-reflect-import-progress")
+            progress_file.write_text("🔄 Starting import...")
+
+            print(f"[Watcher] Running command: {' '.join(cmd)}", flush=True)
+            result = subprocess.run(cmd, capture_output=True, text=True)
+
+            if result.returncode == 0:
+                print(f"[Watcher] Manual import completed successfully", flush=True)
+                # Create completion signal
+                Path("/tmp/claude-self-reflect-import-complete").touch()
+            else:
+                print(f"[Watcher] Manual import failed with code {result.returncode}", flush=True)
+                if result.stderr:
+                    print(f"[Watcher] Error: {result.stderr}", flush=True)
+
+            last_import = current_time
+
+        except Exception as e:
+            print(f"[Watcher] Error during manual import: {e}", flush=True)
+
+    # Regular scheduled import
+    elif current_time - last_import >= WATCH_INTERVAL:
+        try:
+            print(f"[Watcher] Running scheduled import at {datetime.now().isoformat()}", flush=True)
+            result = subprocess.run([
+                sys.executable,
+                "/scripts/import-conversations-unified.py"
+            ], capture_output=True, text=True)
+
+            if result.returncode == 0:
+                print(f"[Watcher] Scheduled import completed successfully", flush=True)
+            else:
+                print(f"[Watcher] Scheduled import failed with code {result.returncode}", flush=True)
+                if result.stderr:
+                    print(f"[Watcher] Error: {result.stderr}", flush=True)
+
+            last_import = current_time
+
+        except Exception as e:
+            print(f"[Watcher] Error during scheduled import: {e}", flush=True)
 
-
-time.sleep(
+    # Short sleep to check for signals frequently
+    time.sleep(CHECK_INTERVAL)