autochunks-0.0.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. autochunk/__init__.py +9 -0
  2. autochunk/__main__.py +5 -0
  3. autochunk/adapters/__init__.py +3 -0
  4. autochunk/adapters/haystack.py +68 -0
  5. autochunk/adapters/langchain.py +81 -0
  6. autochunk/adapters/llamaindex.py +94 -0
  7. autochunk/autochunker.py +606 -0
  8. autochunk/chunkers/__init__.py +100 -0
  9. autochunk/chunkers/agentic.py +184 -0
  10. autochunk/chunkers/base.py +16 -0
  11. autochunk/chunkers/contextual_retrieval.py +151 -0
  12. autochunk/chunkers/fixed_length.py +110 -0
  13. autochunk/chunkers/html_section.py +225 -0
  14. autochunk/chunkers/hybrid_semantic_stat.py +199 -0
  15. autochunk/chunkers/layout_aware.py +192 -0
  16. autochunk/chunkers/parent_child.py +172 -0
  17. autochunk/chunkers/proposition.py +175 -0
  18. autochunk/chunkers/python_ast.py +248 -0
  19. autochunk/chunkers/recursive_character.py +215 -0
  20. autochunk/chunkers/semantic_local.py +140 -0
  21. autochunk/chunkers/sentence_aware.py +102 -0
  22. autochunk/cli.py +135 -0
  23. autochunk/config.py +76 -0
  24. autochunk/embedding/__init__.py +22 -0
  25. autochunk/embedding/adapter.py +14 -0
  26. autochunk/embedding/base.py +33 -0
  27. autochunk/embedding/hashing.py +42 -0
  28. autochunk/embedding/local.py +154 -0
  29. autochunk/embedding/ollama.py +66 -0
  30. autochunk/embedding/openai.py +62 -0
  31. autochunk/embedding/tokenizer.py +9 -0
  32. autochunk/enrichment/__init__.py +0 -0
  33. autochunk/enrichment/contextual.py +29 -0
  34. autochunk/eval/__init__.py +0 -0
  35. autochunk/eval/harness.py +177 -0
  36. autochunk/eval/metrics.py +27 -0
  37. autochunk/eval/ragas_eval.py +234 -0
  38. autochunk/eval/synthetic.py +104 -0
  39. autochunk/quality/__init__.py +31 -0
  40. autochunk/quality/deduplicator.py +326 -0
  41. autochunk/quality/overlap_optimizer.py +402 -0
  42. autochunk/quality/post_processor.py +245 -0
  43. autochunk/quality/scorer.py +459 -0
  44. autochunk/retrieval/__init__.py +0 -0
  45. autochunk/retrieval/in_memory.py +47 -0
  46. autochunk/retrieval/parent_child.py +4 -0
  47. autochunk/storage/__init__.py +0 -0
  48. autochunk/storage/cache.py +34 -0
  49. autochunk/storage/plan.py +40 -0
  50. autochunk/utils/__init__.py +0 -0
  51. autochunk/utils/hashing.py +8 -0
  52. autochunk/utils/io.py +176 -0
  53. autochunk/utils/logger.py +64 -0
  54. autochunk/utils/telemetry.py +44 -0
  55. autochunk/utils/text.py +199 -0
  56. autochunks-0.0.8.dist-info/METADATA +133 -0
  57. autochunks-0.0.8.dist-info/RECORD +61 -0
  58. autochunks-0.0.8.dist-info/WHEEL +5 -0
  59. autochunks-0.0.8.dist-info/entry_points.txt +2 -0
  60. autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
  61. autochunks-0.0.8.dist-info/top_level.txt +1 -0
autochunk/chunkers/parent_child.py
@@ -0,0 +1,172 @@
+
+from __future__ import annotations
+from typing import List, Dict, Any, Optional
+from .base import BaseChunker, Chunk
+from .recursive_character import RecursiveCharacterChunker
+from ..utils.text import count_tokens
+
+class ParentChildChunker(BaseChunker):
+    """
+    Hierarchical (Small-to-Big) Parent-Child Chunker.
+
+    BEST-OF-BREED FEATURES:
+    1. N-Level Hierarchy: Configurable depth (Document > Section > Paragraph > Sentence).
+    2. Sibling References: Tracks prev_sibling_id and next_sibling_id for traversal.
+    3. Parent Context: Stores parent text in metadata for rich LLM context.
+    4. Child Overlap: Optional overlap between children for context continuity.
+    """
+    name = "parent_child"
+
+    def __init__(self,
+                 chunk_sizes: Optional[List[int]] = None,
+                 overlap: int = 0,
+                 track_siblings: bool = True):
+        """
+        Initialize the chunker.
+
+        Args:
+            chunk_sizes: List of sizes for each hierarchy level.
+                Default [2048, 512, 128] creates 3 levels: large -> medium -> small.
+            overlap: Token overlap between sibling chunks at each level.
+            track_siblings: If True, add prev/next references to metadata.
+        """
+        self.chunk_sizes = chunk_sizes or [2048, 512, 128]
+        self.overlap = overlap
+        self.track_siblings = track_siblings
+
+    def chunk(self,
+              doc_id: str,
+              text: str,
+              parent_size: Optional[int] = None,
+              child_size: Optional[int] = None,
+              overlap: Optional[int] = None,
+              return_all_levels: bool = False,
+              **params) -> List[Chunk]:
+        """
+        Create hierarchical chunks with parent-child relationships.
+
+        Args:
+            doc_id: Document identifier
+            text: Input text
+            parent_size: Override first-level size (for backward compatibility)
+            child_size: Override last-level size (for backward compatibility)
+            overlap: Override the overlap setting
+            return_all_levels: If True, return chunks from all levels, not just leaves
+
+        Returns:
+            List of Chunk objects (leaf nodes by default, or all nodes if return_all_levels=True)
+        """
+        # Handle legacy 2-level params
+        if parent_size and child_size:
+            chunk_sizes = [parent_size, child_size]
+        else:
+            chunk_sizes = self.chunk_sizes
+
+        if overlap is None:
+            overlap = self.overlap
+
+        base_chunker = RecursiveCharacterChunker()
+
+        def _build_hierarchy(input_text: str,
+                             level: int,
+                             parent_info: Dict[str, Any],
+                             node_path: str) -> List[Chunk]:
+            """
+            Recursively build the chunk hierarchy.
+
+            Args:
+                input_text: Text to chunk
+                level: Current hierarchy level (0 = root)
+                parent_info: Info about the parent chunk
+                node_path: Path identifier for this node
+
+            Returns:
+                List of chunks at this level (and below if return_all_levels)
+            """
+            if level >= len(chunk_sizes):
+                return []
+
+            current_size = chunk_sizes[level]
+            is_leaf = (level == len(chunk_sizes) - 1)
+
+            # Create chunks at this level
+            level_chunks = base_chunker.chunk(
+                doc_id=f"{doc_id}_L{level}",
+                text=input_text,
+                base_token_size=current_size,
+                overlap=overlap
+            )
+
+            result_chunks = []
+
+            for idx, chunk in enumerate(level_chunks):
+                chunk_id = f"{node_path}#L{level}#{idx}"
+
+                # Build metadata with parent info
+                meta = {
+                    "chunk_index": idx,
+                    "level": level,
+                    "is_leaf": is_leaf,
+                    "strategy": "parent_child",
+                    "token_count": count_tokens(chunk.text)
+                }
+
+                # Add parent references
+                if parent_info:
+                    meta["parent_id"] = parent_info.get("id")
+                    meta["parent_text"] = parent_info.get("text", "")[:500]  # Truncate for efficiency
+
+                # Add sibling references
+                if self.track_siblings:
+                    if idx > 0:
+                        meta["prev_sibling_id"] = f"{node_path}#L{level}#{idx - 1}"
+                    if idx < len(level_chunks) - 1:
+                        meta["next_sibling_id"] = f"{node_path}#L{level}#{idx + 1}"
+
+                node = Chunk(
+                    id=chunk_id,
+                    doc_id=doc_id,
+                    text=chunk.text,
+                    meta=meta
+                )
+
+                # Add to results based on the return_all_levels setting
+                if return_all_levels or is_leaf:
+                    result_chunks.append(node)
+
+                # Recurse to children if not at leaf level
+                if not is_leaf:
+                    child_parent_info = {
+                        "id": chunk_id,
+                        "text": chunk.text
+                    }
+                    children = _build_hierarchy(
+                        chunk.text,
+                        level + 1,
+                        child_parent_info,
+                        chunk_id
+                    )
+
+                    # Update the parent with child references
+                    if children and return_all_levels:
+                        node.meta["child_ids"] = [c.id for c in children]
+
+                    result_chunks.extend(children)
+
+            return result_chunks
+
+        # Build from the root
+        root_parent_info = {
+            "id": doc_id,
+            "text": text[:500]  # Document context
+        }
+
+        all_chunks = _build_hierarchy(text, 0, root_parent_info, doc_id)
+
+        # Re-index final chunks sequentially
+        for i, chunk in enumerate(all_chunks):
+            chunk.meta["global_index"] = i
+
+        return all_chunks
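
Usage sketch for the small-to-big pattern this chunker enables: index the small leaf chunks, then expand a retrieved leaf to its larger parent via the metadata. The chunker API is as defined above; the sample text and the dictionary-based lookup are illustrative assumptions, not part of the package.

    from autochunk.chunkers.parent_child import ParentChildChunker

    # Toy document; any long text works here.
    long_text = "\n\n".join(
        f"Section {i}. " + " ".join(f"Fact {i}.{j}." for j in range(40))
        for i in range(10)
    )

    chunker = ParentChildChunker(chunk_sizes=[1024, 256], overlap=20)
    nodes = chunker.chunk(doc_id="handbook", text=long_text, return_all_levels=True)

    # Embed/index only the small leaves; keep every node addressable by id.
    by_id = {n.id: n for n in nodes}
    leaves = [n for n in nodes if n.meta["is_leaf"]]

    # After retrieval, swap a matched leaf for its larger parent as LLM context.
    hit = leaves[0]
    parent = by_id.get(hit.meta.get("parent_id"))
    context = parent.text if parent else hit.text

Sibling traversal works the same way: follow prev_sibling_id / next_sibling_id through the id map to widen context without re-chunking.
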
autochunk/chunkers/proposition.py
@@ -0,0 +1,175 @@
+
+from __future__ import annotations
+import re
+from typing import List, Callable, Optional
+from .base import BaseChunker, Chunk
+from ..utils.text import count_tokens, split_sentences
+
+class PropositionChunker(BaseChunker):
+    """
+    Proposition-Based Chunker for Atomic Fact Extraction.
+
+    Instead of arbitrary text splits, this chunker extracts atomic propositions
+    (self-contained facts) from the text. Each chunk is a single, verifiable statement.
+
+    BEST-OF-BREED FEATURES:
+    1. Fact-Level Granularity: Each chunk is one atomic fact.
+    2. Self-Contained: Every proposition is understandable without context.
+    3. Decontextualized: Pronouns and references are resolved.
+    4. LLM-Powered: Uses a language model for accurate extraction.
+
+    Reference: the "Dense X Retrieval" paper; Greg Kamradt's proposition chunker.
+    """
+    name = "proposition"
+
+    DEFAULT_SYSTEM_PROMPT = """You are an expert at extracting atomic propositions from text.
+
+An atomic proposition is:
+- A single, self-contained fact
+- Expressed in a complete sentence
+- Understandable WITHOUT any additional context
+- Has all pronouns replaced with their referents
+- Contains no dependent references (like "this", "that", "the above")
+
+For example:
+Original: "John went to the store. He bought milk there."
+Propositions:
+1. John went to the store.
+2. John bought milk at the store.
+
+Note how "He" became "John" and "there" became "at the store"."""
+
+    DEFAULT_USER_TEMPLATE = """Extract all atomic propositions from the following text. Each proposition should be:
+1. A complete, self-contained sentence
+2. Understandable without additional context
+3. Have all pronouns resolved to their referents
+
+TEXT:
+{text}
+
+Output each proposition on a new line, numbered. Only output the propositions, no other text.
+
+1."""
+
+    def __init__(self,
+                 llm_fn: Optional[Callable[[str, str], str]] = None,
+                 system_prompt: Optional[str] = None,
+                 user_template: Optional[str] = None,
+                 max_tokens_per_call: int = 2000):
+        """
+        Initialize the proposition chunker.
+
+        Args:
+            llm_fn: Function that takes (system_prompt, user_message) and returns the LLM response.
+            system_prompt: Custom system prompt
+            user_template: Custom user message template (must include {text})
+            max_tokens_per_call: Max tokens to process in one LLM call
+        """
+        self.llm_fn = llm_fn
+        self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT
+        self.user_template = user_template or self.DEFAULT_USER_TEMPLATE
+        self.max_tokens_per_call = max_tokens_per_call
+
+    def chunk(self,
+              doc_id: str,
+              text: str,
+              **params) -> List[Chunk]:
+        """
+        Extract atomic propositions from text.
+
+        Args:
+            doc_id: Document identifier
+            text: Input text
+
+        Returns:
+            List of Chunk objects, each containing one proposition
+        """
+        if not text.strip():
+            return []
+
+        # Fallback if no LLM function: basic sentence splitting
+        if self.llm_fn is None:
+            sentences = split_sentences(text)
+            return [
+                Chunk(
+                    id=f"{doc_id}#prop#{i}",
+                    doc_id=doc_id,
+                    text=s.strip(),
+                    meta={
+                        "chunk_index": i,
+                        "strategy": "proposition_fallback",
+                        "is_atomic": False
+                    }
+                ) for i, s in enumerate(sentences) if s.strip()
+            ]
+
+        # Batch sentences so each LLM call stays under max_tokens_per_call
+        propositions = []
+        sentences = split_sentences(text)
+
+        current_batch = []
+        current_tokens = 0
+
+        for sentence in sentences:
+            sent_tokens = count_tokens(sentence)
+
+            if current_tokens + sent_tokens > self.max_tokens_per_call and current_batch:
+                # Process the current batch
+                batch_text = " ".join(current_batch)
+                batch_props = self._extract_propositions(batch_text)
+                propositions.extend(batch_props)
+                current_batch = []
+                current_tokens = 0
+
+            current_batch.append(sentence)
+            current_tokens += sent_tokens
+
+        # Process the final batch
+        if current_batch:
+            batch_text = " ".join(current_batch)
+            batch_props = self._extract_propositions(batch_text)
+            propositions.extend(batch_props)
+
+        # Create chunk objects
+        chunks = []
+        for i, prop in enumerate(propositions):
+            if prop.strip():
+                chunks.append(Chunk(
+                    id=f"{doc_id}#prop#{i}",
+                    doc_id=doc_id,
+                    text=prop.strip(),
+                    meta={
+                        "chunk_index": i,
+                        "strategy": "proposition",
+                        "is_atomic": True,
+                        "token_count": count_tokens(prop)
+                    }
+                ))
+
+        return chunks
+
+    def _extract_propositions(self, text: str) -> List[str]:
+        """Extract propositions using the LLM; fall back to sentence splitting on error."""
+        user_message = self.user_template.format(text=text)
+
+        try:
+            response = self.llm_fn(self.system_prompt, user_message)
+
+            # Parse the numbered list
+            propositions = []
+            for line in response.strip().split("\n"):
+                line = line.strip()
+                if not line:
+                    continue
+
+                # Strip numbering ("1." or "1)" style)
+                cleaned = re.sub(r'^\d+[\.\)]\s*', '', line)
+                if cleaned:
+                    propositions.append(cleaned)
+
+            return propositions
+
+        except Exception:
+            # Fallback to sentence splitting
+            return split_sentences(text)
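
The llm_fn contract is just a callable taking (system_prompt, user_message) and returning the raw completion text, so any backend can be plugged in. A minimal sketch using the OpenAI client as that backend; the model name, temperature, and client wiring are assumptions for illustration, not part of this package:

    from openai import OpenAI
    from autochunk.chunkers.proposition import PropositionChunker

    client = OpenAI()  # reads OPENAI_API_KEY from the environment

    def llm_fn(system_prompt: str, user_message: str) -> str:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message},
            ],
            temperature=0.0,
        )
        return resp.choices[0].message.content or ""

    chunker = PropositionChunker(llm_fn=llm_fn)
    chunks = chunker.chunk("memo", "John went to the store. He bought milk there.")
    for c in chunks:
        print(c.meta["chunk_index"], c.text)  # one decontextualized fact per chunk

Without an llm_fn, the same call degrades gracefully to the sentence-splitting fallback and marks each chunk with is_atomic=False.
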
autochunk/chunkers/python_ast.py
@@ -0,0 +1,248 @@
+
+from __future__ import annotations
+from typing import List, Optional
+import ast
+from .base import BaseChunker, Chunk
+from ..utils.text import count_tokens
+
+class PythonASTChunker(BaseChunker):
+    """
+    AST-Based Python Code Chunker.
+
+    Uses Python's Abstract Syntax Tree to split code at natural boundaries
+    (classes, functions, imports) rather than arbitrary line counts.
+
+    BEST-OF-BREED FEATURES:
+    1. Structural Awareness: Splits at class/function boundaries.
+    2. Docstring Preservation: Keeps docstrings with their functions.
+    3. Import Grouping: Groups imports together.
+    4. Nested Handling: Handles nested classes and functions.
+    5. Context Prepending: Optionally prepends module/class context.
+    """
+    name = "python_ast"
+
+    def __init__(self,
+                 include_imports_in_all: bool = True,
+                 split_classes: bool = True,
+                 split_functions: bool = True,
+                 max_tokens: int = 1000,
+                 prepend_context: bool = True):
+        """
+        Initialize the Python AST chunker.
+
+        Args:
+            include_imports_in_all: If True, prepend imports to every chunk.
+            split_classes: If True, split classes into separate chunks.
+            split_functions: If True, split functions into separate chunks.
+            max_tokens: Maximum tokens per chunk (will further split if exceeded).
+            prepend_context: If True, prepend the class name to method chunks.
+        """
+        self.include_imports_in_all = include_imports_in_all
+        self.split_classes = split_classes
+        self.split_functions = split_functions
+        self.max_tokens = max_tokens
+        self.prepend_context = prepend_context
+
+    def chunk(self,
+              doc_id: str,
+              text: str,
+              **params) -> List[Chunk]:
+        """
+        Parse Python code and split at structural boundaries.
+
+        Args:
+            doc_id: Document identifier
+            text: Python source code
+
+        Returns:
+            List of Chunk objects
+        """
+        if not text.strip():
+            return []
+
+        try:
+            tree = ast.parse(text)
+        except SyntaxError:
+            # Fall back to recursive character splitting for invalid Python
+            from .recursive_character import RecursiveCharacterChunker
+            return RecursiveCharacterChunker().chunk(doc_id, text, base_token_size=self.max_tokens)
+
+        lines = text.split("\n")
+        chunks = []
+
+        # Collect top-level imports (iterating direct children only; ast.walk
+        # would also pull in imports nested inside functions)
+        imports = []
+        for node in ast.iter_child_nodes(tree):
+            if isinstance(node, (ast.Import, ast.ImportFrom)):
+                start = node.lineno - 1
+                end = getattr(node, 'end_lineno', node.lineno)
+                imports.append("\n".join(lines[start:end]))
+
+        import_block = "\n".join(imports)
+
+        # Process top-level definitions
+        for node in ast.iter_child_nodes(tree):
+            if isinstance(node, ast.ClassDef):
+                chunks.extend(self._process_class(doc_id, node, lines, import_block, len(chunks)))
+            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                chunks.extend(self._process_function(doc_id, node, lines, import_block, len(chunks), None))
+
+        # If no structures were found, treat the module as a single chunk
+        if not chunks:
+            return [Chunk(
+                id=f"{doc_id}#py#0",
+                doc_id=doc_id,
+                text=text,
+                meta={"chunk_index": 0, "strategy": "python_ast", "type": "module"}
+            )]
+
+        return chunks
+
+    def _process_class(self, doc_id: str, node: ast.ClassDef, lines: List[str],
+                       import_block: str, start_idx: int) -> List[Chunk]:
+        """Process a class definition."""
+        chunks = []
+
+        start = node.lineno - 1
+        end = node.end_lineno
+        class_text = "\n".join(lines[start:end])
+        class_name = node.name
+
+        # Get the class docstring
+        docstring = ast.get_docstring(node) or ""
+
+        if not self.split_classes:
+            # Return the whole class as one chunk
+            full_text = class_text
+            if self.include_imports_in_all and import_block:
+                full_text = import_block + "\n\n" + class_text
+
+            chunks.append(Chunk(
+                id=f"{doc_id}#py#{start_idx}",
+                doc_id=doc_id,
+                text=full_text,
+                meta={
+                    "chunk_index": start_idx,
+                    "strategy": "python_ast",
+                    "type": "class",
+                    "name": class_name,
+                    "docstring": docstring[:200],
+                    "token_count": count_tokens(full_text)
+                }
+            ))
+            return chunks
+
+        # Process methods within the class
+        methods_processed = set()
+
+        for item in node.body:
+            if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                method_chunks = self._process_function(
+                    doc_id, item, lines, import_block,
+                    start_idx + len(chunks), class_name
+                )
+                chunks.extend(method_chunks)
+                methods_processed.add(item.lineno)
+
+        # If the class has no methods, emit it whole
+        if not methods_processed:
+            full_text = class_text
+            if self.include_imports_in_all and import_block:
+                full_text = import_block + "\n\n" + class_text
+
+            chunks.append(Chunk(
+                id=f"{doc_id}#py#{start_idx}",
+                doc_id=doc_id,
+                text=full_text,
+                meta={
+                    "chunk_index": start_idx,
+                    "strategy": "python_ast",
+                    "type": "class",
+                    "name": class_name,
+                    "token_count": count_tokens(full_text)
+                }
+            ))
+
+        return chunks
+
+    def _process_function(self, doc_id: str, node, lines: List[str],
+                          import_block: str, idx: int, class_name: Optional[str]) -> List[Chunk]:
+        """Process a function definition."""
+        start = node.lineno - 1
+        end = node.end_lineno
+        func_text = "\n".join(lines[start:end])
+        func_name = node.name
+
+        # Get the docstring
+        docstring = ast.get_docstring(node) or ""
+
+        # Build the context prefix
+        context_prefix = ""
+        if self.prepend_context and class_name:
+            context_prefix = f"# Method of class: {class_name}\n"
+
+        # Build the full text
+        full_text = func_text
+        if context_prefix:
+            full_text = context_prefix + full_text
+        if self.include_imports_in_all and import_block:
+            full_text = import_block + "\n\n" + full_text
+
+        # Split oversized functions further with the recursive character splitter
+        if count_tokens(full_text) > self.max_tokens:
+            from .recursive_character import RecursiveCharacterChunker
+            sub_chunker = RecursiveCharacterChunker()
+            sub_chunks = sub_chunker.chunk(
+                f"{doc_id}_func_{func_name}",
+                func_text,
+                base_token_size=self.max_tokens
+            )
+
+            chunks = []
+            for i, sc in enumerate(sub_chunks):
+                chunk_text = sc.text
+                if self.include_imports_in_all and import_block and i == 0:
+                    chunk_text = import_block + "\n\n" + chunk_text
+                if context_prefix and i == 0:
+                    chunk_text = context_prefix + chunk_text
+
+                chunks.append(Chunk(
+                    id=f"{doc_id}#py#{idx + i}",
+                    doc_id=doc_id,
+                    text=chunk_text,
+                    meta={
+                        "chunk_index": idx + i,
+                        "strategy": "python_ast",
+                        "type": "function_part",
+                        "name": func_name,
+                        "class_name": class_name,
+                        "part": i + 1,
+                        "token_count": count_tokens(chunk_text)
+                    }
+                ))
+            return chunks
+
+        qualified_name = f"{class_name}.{func_name}" if class_name else func_name
+
+        return [Chunk(
+            id=f"{doc_id}#py#{idx}",
+            doc_id=doc_id,
+            text=full_text,
+            meta={
+                "chunk_index": idx,
+                "strategy": "python_ast",
+                "type": "method" if class_name else "function",
+                "name": func_name,
+                "qualified_name": qualified_name,
+                "class_name": class_name,
+                "docstring": docstring[:200] if docstring else "",
+                "token_count": count_tokens(full_text)
+            }
+        )]
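
A short end-to-end sketch on a toy module; the source string is illustrative, and exact token counts depend on the package's count_tokens implementation:

    import textwrap
    from autochunk.chunkers.python_ast import PythonASTChunker

    source = textwrap.dedent('''
        import os

        class Greeter:
            """Says hello."""
            def greet(self, name: str) -> str:
                return f"Hello, {name}"

        def main() -> None:
            print(Greeter().greet(os.getlogin()))
    ''')

    chunker = PythonASTChunker(max_tokens=400)
    for c in chunker.chunk("example.py", source):
        print(c.meta["type"], c.meta.get("qualified_name", c.meta["name"]))
    # Expected shape: a "method" chunk for Greeter.greet and a "function"
    # chunk for main, each with the top-level import block prepended and
    # the method carrying a "# Method of class: Greeter" context prefix.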