claude-jacked 0.2.3__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- claude_jacked-0.2.9.dist-info/METADATA +523 -0
- claude_jacked-0.2.9.dist-info/RECORD +33 -0
- jacked/cli.py +752 -47
- jacked/client.py +196 -29
- jacked/data/agents/code-simplicity-reviewer.md +87 -0
- jacked/data/agents/defensive-error-handler.md +93 -0
- jacked/data/agents/double-check-reviewer.md +214 -0
- jacked/data/agents/git-pr-workflow-manager.md +149 -0
- jacked/data/agents/issue-pr-coordinator.md +131 -0
- jacked/data/agents/pr-workflow-checker.md +199 -0
- jacked/data/agents/readme-maintainer.md +123 -0
- jacked/data/agents/test-coverage-engineer.md +155 -0
- jacked/data/agents/test-coverage-improver.md +139 -0
- jacked/data/agents/wiki-documentation-architect.md +580 -0
- jacked/data/commands/audit-rules.md +103 -0
- jacked/data/commands/dc.md +155 -0
- jacked/data/commands/learn.md +89 -0
- jacked/data/commands/pr.md +4 -0
- jacked/data/commands/redo.md +85 -0
- jacked/data/commands/techdebt.md +115 -0
- jacked/data/prompts/security_gatekeeper.txt +58 -0
- jacked/data/rules/jacked_behaviors.md +11 -0
- jacked/data/skills/jacked/SKILL.md +162 -0
- jacked/index_write_tracker.py +227 -0
- jacked/indexer.py +255 -129
- jacked/retriever.py +389 -137
- jacked/searcher.py +65 -13
- jacked/transcript.py +339 -0
- claude_jacked-0.2.3.dist-info/METADATA +0 -483
- claude_jacked-0.2.3.dist-info/RECORD +0 -13
- {claude_jacked-0.2.3.dist-info → claude_jacked-0.2.9.dist-info}/WHEEL +0 -0
- {claude_jacked-0.2.3.dist-info → claude_jacked-0.2.9.dist-info}/entry_points.txt +0 -0
- {claude_jacked-0.2.3.dist-info → claude_jacked-0.2.9.dist-info}/licenses/LICENSE +0 -0
jacked/indexer.py
CHANGED
```diff
@@ -2,6 +2,13 @@
 Session indexing for Jacked.
 
 Handles parsing Claude sessions and upserting to Qdrant with server-side embedding.
+
+Content types indexed:
+- plan: Full implementation strategy from ~/.claude/plans/{slug}.md
+- subagent_summary: Rich summaries from subagent outputs
+- summary_label: Tiny chapter titles from compaction events
+- user_message: First few user messages for intent matching
+- chunk: Full transcript chunks for full retrieval mode
 """
 
 import logging
```
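All five content types land in the same collection and are distinguished by a `content_type` payload field (visible in the payloads built further down this diff). As a hedged sketch of what that enables at query time: a caller can restrict retrieval to the high-signal types. The endpoint, collection name, model name, and query text below are all hypothetical, with the model standing in for the package's `INFERENCE_MODEL`:

```python
from qdrant_client import QdrantClient, models

client = QdrantClient(url="https://example.qdrant.cloud", api_key="...")  # hypothetical endpoint

# Only match the "gold" content types; the field name comes from the payloads in this diff.
gold = models.Filter(
    must=[
        models.FieldCondition(
            key="content_type",
            match=models.MatchAny(any=["plan", "subagent_summary"]),
        )
    ]
)

hits = client.query_points(
    collection_name="jacked_sessions",  # assumed collection name
    query=models.Document(text="auth refactor plan", model="BAAI/bge-small-en-v1.5"),
    query_filter=gold,
    limit=5,
)
```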
```diff
@@ -20,11 +27,11 @@ from jacked.config import (
 )
 from jacked.client import QdrantSessionClient, INFERENCE_MODEL
 from jacked.transcript import (
-
+    parse_jsonl_file_enriched,
     chunk_text,
-
-    ParsedTranscript,
+    EnrichedTranscript,
 )
+from jacked.index_write_tracker import IndexWriteTracker
 
 
 logger = logging.getLogger(__name__)
@@ -34,9 +41,12 @@ class SessionIndexer:
     """
     Indexes Claude sessions to Qdrant using server-side embedding.
 
-    Creates
-    -
-    -
+    Creates multiple content types for each session:
+    - plan: Full implementation strategy (gold - highest priority)
+    - subagent_summary: Rich summaries from agent outputs (gold)
+    - summary_label: Tiny chapter titles from compaction
+    - user_message: First few user messages for intent matching
+    - chunk: Full transcript chunks for full retrieval mode
 
     Qdrant Cloud Inference handles all embedding server-side.
 
```
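Per the docstring above, Qdrant Cloud Inference does the embedding server-side, which is why no embedding model is loaded anywhere in this file: each point carries a `models.Document` instead of a precomputed vector, and the server embeds the text on ingest. A minimal sketch of that upsert pattern, with placeholder endpoint, collection, and model names:

```python
import uuid
from qdrant_client import QdrantClient, models

client = QdrantClient(url="https://example.qdrant.cloud", api_key="...")  # hypothetical

client.upsert(
    collection_name="jacked_sessions",  # assumed name
    points=[
        models.PointStruct(
            id=str(uuid.uuid4()),
            # No local vector: the text is embedded server-side on ingest.
            vector=models.Document(
                text="Implemented incremental indexing for session transcripts.",
                model="BAAI/bge-small-en-v1.5",  # placeholder for INFERENCE_MODEL
            ),
            payload={"content_type": "summary_label"},
        )
    ],
)
```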
```diff
@@ -60,6 +70,10 @@ class SessionIndexer:
         """
         self.config = config
         self.client = client or QdrantSessionClient(config)
+        # Config hash for detecting chunk_size/overlap changes
+        self._config_hash = content_hash(f"{config.chunk_size}:{config.chunk_overlap}")
+        # Write tracker for incremental indexing (NOT for retrieval!)
+        self._tracker = IndexWriteTracker(self._config_hash)
 
     def index_session(
         self,
```
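`content_hash` is imported from jacked.config and its implementation is not shown in this diff; the config-fingerprint trick works with any stable digest. A sketch assuming a sha256-style helper:

```python
import hashlib

def content_hash(text: str) -> str:
    # Assumption: the real helper in jacked.config is a stable digest like this.
    return hashlib.sha256(text.encode("utf-8")).hexdigest()

# Changing either chunking parameter changes the fingerprint, which the
# indexer uses to invalidate the tracker's cached per-point hashes.
assert content_hash("4000:200") != content_hash("4000:400")
```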
```diff
@@ -68,20 +82,24 @@ class SessionIndexer:
         force: bool = False,
     ) -> dict:
         """
-        Index a single session to Qdrant.
+        Index a single session to Qdrant with incremental updates.
+
+        Uses local SQLite tracker to avoid re-pushing unchanged content.
+        Only indexes NEW or CHANGED points - much more efficient than
+        the old delete-all-and-replace approach.
 
         Args:
             session_path: Path to the .jsonl session file
             repo_path: Full path to the repository
-            force: If True, re-
+            force: If True, clear tracker and re-seed from Qdrant
 
         Returns:
             Dict with indexing results:
             - session_id: The session ID
-            - indexed: Whether
-            - skipped: Whether it was skipped (
-            -
-            -
+            - indexed: Whether new content was indexed
+            - skipped: Whether it was skipped (no new content)
+            - new_points: Number of new/changed points indexed
+            - plans, subagent_summaries, etc.: Counts by content type
             - error: Error message if failed
 
         Examples:
@@ -92,8 +110,12 @@ class SessionIndexer:
             "session_id": session_path.stem,
             "indexed": False,
             "skipped": False,
-            "
-            "
+            "new_points": 0,
+            "plans": 0,
+            "subagent_summaries": 0,
+            "summary_labels": 0,
+            "user_messages": 0,
+            "chunks": 0,
             "error": None,
         }
 
```
```diff
@@ -101,46 +123,79 @@ class SessionIndexer:
             # Ensure collection exists
             self.client.ensure_collection()
 
-            # Parse the transcript
-            transcript =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Parse the transcript with enriched data
+            transcript = parse_jsonl_file_enriched(session_path)
+            session_id = transcript.session_id
+            result["session_id"] = session_id
+
+            # Check session metadata from tracker
+            meta = self._tracker.get_session_meta(session_id)
+
+            # Config changed? Clear and re-seed from Qdrant
+            if meta and meta["config_hash"] != self._config_hash:
+                logger.info(f"Config changed for session {session_id}, re-seeding from Qdrant")
+                self._tracker.clear_session(session_id)
+                meta = None
+
+            # Previous crash mid-indexing? Force re-index
+            if meta and meta["status"] == "indexing":
+                logger.info(f"Session {session_id} was interrupted mid-index, forcing re-seed")
+                force = True
+
+            # Cache miss or force? Seed from Qdrant (source of truth, THIS USER ONLY)
+            if meta is None or force:
+                self._tracker.clear_session(session_id)
+                self._tracker.seed_from_qdrant(session_id, self.client, self.config.user_name)
+
+            # Get what's already indexed
+            indexed = self._tracker.get_session_state(session_id)
+
+            # Mark as indexing BEFORE doing work (crash safety)
+            self._tracker.mark_indexing(session_id)
+
+            # Build only NEW/CHANGED points
+            points_to_index, points_metadata = self._build_incremental_points(
+                transcript, repo_path, indexed
+            )
+
+            if not points_to_index:
+                self._tracker.mark_complete(session_id)
+                result["skipped"] = True
+                logger.debug(f"Session {session_id}: no new content to index")
                 return result
 
-            #
-            self.client.
+            # Upsert to Qdrant (no delete needed - deterministic IDs handle overwrites)
+            self.client.upsert_points(points_to_index)
 
-            #
-
+            # Record what we indexed in tracker
+            for content_type, idx, hash_val, point_id in points_metadata:
+                self._tracker.record_indexed(session_id, content_type, idx, hash_val, str(point_id))
 
-
+            self._tracker.mark_complete(session_id)
+
+            # Count results by content_type
             result["indexed"] = True
-
-
-            if
-                result["
-            elif
-                result["
+            result["new_points"] = len(points_to_index)
+            for content_type, _, _, _ in points_metadata:
+                if content_type == "plan":
+                    result["plans"] += 1
+                elif content_type == "subagent_summary":
+                    result["subagent_summaries"] += 1
+                elif content_type == "summary_label":
+                    result["summary_labels"] += 1
+                elif content_type == "user_message":
+                    result["user_messages"] += 1
+                elif content_type == "chunk":
+                    result["chunks"] += 1
 
             logger.info(
-                f"Indexed session {
-                f"{result['
-                f"{result['
+                f"Indexed session {session_id}: "
+                f"{result['new_points']} new points ("
+                f"{result['plans']} plan, "
+                f"{result['subagent_summaries']} summaries, "
+                f"{result['summary_labels']} labels, "
+                f"{result['user_messages']} msgs, "
+                f"{result['chunks']} chunks)"
             )
 
             return result
```
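Note the ordering in this hunk: `mark_indexing` lands before any Qdrant write, `record_indexed` after each point is pushed, and `mark_complete` only at the end, so a crash leaves the session stuck at status "indexing" and forces a re-seed on the next run. A minimal sketch of that status flow using SQLite; the table layout is illustrative, not the actual schema in index_write_tracker.py:

```python
import sqlite3

con = sqlite3.connect("write_tracker.db")
con.execute(
    "CREATE TABLE IF NOT EXISTS session_meta ("
    "session_id TEXT PRIMARY KEY, status TEXT, config_hash TEXT)"
)

def mark_indexing(session_id: str, config_hash: str) -> None:
    # Written BEFORE any Qdrant work; a crash strands status at 'indexing'.
    con.execute(
        "INSERT OR REPLACE INTO session_meta VALUES (?, 'indexing', ?)",
        (session_id, config_hash),
    )
    con.commit()

def mark_complete(session_id: str) -> None:
    # Only reached once every point has been pushed and recorded.
    con.execute(
        "UPDATE session_meta SET status = 'complete' WHERE session_id = ?",
        (session_id,),
    )
    con.commit()
```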
```diff
@@ -150,41 +205,40 @@ class SessionIndexer:
             result["error"] = str(e)
             return result
 
-    def
-        """
-        Get the content hash of an existing indexed session.
+    def _make_point_id(self, session_id: str, content_type: str, index: int) -> str:
+        """Generate deterministic point ID.
 
         Args:
-            session_id:
+            session_id: The session UUID
+            content_type: One of plan, subagent_summary, summary_label, user_message, chunk
+            index: Index within that content type
 
         Returns:
-
+            UUID5 string for the point
         """
-
-
-
-        if point and point.payload:
-            return point.payload.get("content_hash")
-        return None
-
-    def _build_points(
+        return str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{session_id}:{content_type}:{index}"))
+
+    def _build_incremental_points(
         self,
-        transcript:
+        transcript: EnrichedTranscript,
         repo_path: str,
-
+        indexed: dict,
+    ) -> tuple[list[models.PointStruct], list[tuple]]:
         """
-        Build
-
-        Uses models.Document for server-side embedding via Qdrant Cloud Inference.
+        Build only NEW or CHANGED points by comparing against what's already indexed.
 
         Args:
-            transcript:
+            transcript: EnrichedTranscript with all extracted data
             repo_path: Full path to the repository
+            indexed: Dict mapping (content_type, index) -> content_hash from tracker
 
         Returns:
-
+            Tuple of (points_to_index, points_metadata) where points_metadata is
+            a list of (content_type, index, content_hash, point_id) tuples
         """
-
+        points_to_index = []
+        points_metadata = []  # (content_type, index, hash, point_id)
+
         repo_id = get_repo_id(repo_path)
         repo_name = get_repo_name(repo_path)
         full_hash = content_hash(transcript.full_text)
```
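`uuid.uuid5` is name-based and deterministic, which is what makes the "no delete needed" comment in the earlier hunk true: the same `(session_id, content_type, index)` triple always maps to the same point ID, so upserting changed content overwrites the old point instead of duplicating it. For example:

```python
import uuid

def make_point_id(session_id: str, content_type: str, index: int) -> str:
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{session_id}:{content_type}:{index}"))

# Same inputs, same UUID - across runs and across machines.
assert make_point_id("abc123", "chunk", 7) == make_point_id("abc123", "chunk", 7)
# Any component changing yields a different, non-colliding ID.
assert make_point_id("abc123", "chunk", 8) != make_point_id("abc123", "chunk", 7)
```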
```diff
@@ -194,79 +248,151 @@
             else datetime.now().isoformat()
         )
 
-        #
-
-
-
-
+        # Base payload for all points
+        base_payload = {
+            "repo_id": repo_id,
+            "repo_name": repo_name,
+            "repo_path": repo_path,
+            "session_id": transcript.session_id,
+            "user_name": self.config.user_name,
+            "machine": self.config.machine_name,
+            "timestamp": timestamp_str,
+            "content_hash": full_hash,
+            "slug": transcript.slug,
+        }
+
+        # 1. Plan - check hash
+        if transcript.plan:
+            plan_hash = content_hash(transcript.plan.content)
+            if indexed.get(("plan", 0)) != plan_hash:
+                point_id = self._make_point_id(transcript.session_id, "plan", 0)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=transcript.plan.content[:8000],
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "plan",
+                            "content_type": "plan",
+                            "content": transcript.plan.content,
+                            "plan_path": str(transcript.plan.path),
+                            "chunk_index": 0,
+                        },
+                    )
+                )
+                points_metadata.append(("plan", 0, plan_hash, point_id))
+
+        # 2. User messages - compare by content hash
+        max_user_messages = 5
+        for i, msg in enumerate(transcript.user_messages[:max_user_messages]):
+            if not msg.content or len(msg.content) < 20:
+                continue
+            msg_hash = content_hash(msg.content)
+            if indexed.get(("user_message", i)) != msg_hash:
+                point_id = self._make_point_id(transcript.session_id, "user_message", i)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=msg.content[:2000],
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "user_message",
+                            "content_type": "user_message",
+                            "content": msg.content,
+                            "chunk_index": i,
+                        },
+                    )
+                )
+                points_metadata.append(("user_message", i, msg_hash, point_id))
+
+        # 3. Agent summaries - compare by hash
+        for i, agent_summary in enumerate(transcript.agent_summaries):
+            summary_hash = content_hash(agent_summary.summary_text)
+            if indexed.get(("subagent_summary", i)) != summary_hash:
+                point_id = self._make_point_id(transcript.session_id, "subagent_summary", i)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=agent_summary.summary_text[:8000],
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "subagent_summary",
+                            "content_type": "subagent_summary",
+                            "content": agent_summary.summary_text,
+                            "agent_id": agent_summary.agent_id,
+                            "agent_type": agent_summary.agent_type,
+                            "chunk_index": i,
+                        },
+                    )
+                )
+                points_metadata.append(("subagent_summary", i, summary_hash, point_id))
+
+        # 4. Summary labels - compare by hash
+        for i, label in enumerate(transcript.summary_labels):
+            label_hash = content_hash(label.label)
+            if indexed.get(("summary_label", i)) != label_hash:
+                point_id = self._make_point_id(transcript.session_id, "summary_label", i)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=label.label,
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "summary_label",
+                            "content_type": "summary_label",
+                            "content": label.label,
+                            "leaf_uuid": label.leaf_uuid,
+                            "chunk_index": i,
+                        },
+                    )
+                )
+                points_metadata.append(("summary_label", i, label_hash, point_id))
 
-        #
+        # 5. Chunks - compare by hash (handles boundary drift)
         transcript_chunks = chunk_text(
             transcript.full_text,
             chunk_size=self.config.chunk_size,
             overlap=self.config.chunk_overlap,
         )
 
-        # Create intent points with Document for server-side embedding
-        for i, chunk in enumerate(intent_chunks):
-            if not chunk.strip():
-                continue
-
-            # Generate deterministic UUID from session_id + type + index
-            point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{transcript.session_id}_intent_{i}"))
-            points.append(
-                models.PointStruct(
-                    id=point_id,
-                    vector=models.Document(
-                        text=chunk,
-                        model=INFERENCE_MODEL,
-                    ),
-                    payload={
-                        "type": "intent",
-                        "repo_id": repo_id,
-                        "repo_name": repo_name,
-                        "repo_path": repo_path,
-                        "session_id": transcript.session_id,
-                        "user_name": self.config.user_name,
-                        "machine": self.config.machine_name,
-                        "timestamp": timestamp_str,
-                        "content_hash": full_hash,
-                        "intent_text": chunk,
-                        "chunk_index": i,
-                        "total_chunks": len(intent_chunks),
-                        "transcript_chunk_count": len(transcript_chunks),
-                    },
-                )
-            )
-
-        # Create transcript chunk points for retrieval
         for i, chunk in enumerate(transcript_chunks):
             if not chunk.strip():
                 continue
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    },
+            chunk_hash = content_hash(chunk)
+            if indexed.get(("chunk", i)) != chunk_hash:
+                point_id = self._make_point_id(transcript.session_id, "chunk", i)
+                points_to_index.append(
+                    models.PointStruct(
+                        id=point_id,
+                        vector=models.Document(
+                            text=chunk[:4000],
+                            model=INFERENCE_MODEL,
+                        ),
+                        payload={
+                            **base_payload,
+                            "type": "chunk",
+                            "content_type": "chunk",
+                            "content": chunk,
+                            "chunk_index": i,
+                            "total_chunks": len(transcript_chunks),
+                        },
+                    )
                 )
-
+                points_metadata.append(("chunk", i, chunk_hash, point_id))
 
-        return
+        return points_to_index, points_metadata
 
     def index_all_sessions(
         self,
```
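All five sections of `_build_incremental_points` repeat the same compare-by-hash pattern: hash the candidate content, look up the previously indexed hash under `(content_type, index)`, and build a point only when the two differ. The same idea in miniature, as a standalone toy rather than the package's code:

```python
import hashlib

def content_hash(text: str) -> str:
    return hashlib.sha256(text.encode("utf-8")).hexdigest()

# What the tracker says is already in Qdrant: (content_type, index) -> hash.
indexed = {("chunk", 0): content_hash("unchanged text")}

chunks = ["unchanged text", "freshly appended text"]
to_push = [
    (i, c) for i, c in enumerate(chunks)
    if indexed.get(("chunk", i)) != content_hash(c)
]
assert [i for i, _ in to_push] == [1]  # only the new chunk is re-pushed
```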