autochunks-0.0.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autochunk/__init__.py +9 -0
- autochunk/__main__.py +5 -0
- autochunk/adapters/__init__.py +3 -0
- autochunk/adapters/haystack.py +68 -0
- autochunk/adapters/langchain.py +81 -0
- autochunk/adapters/llamaindex.py +94 -0
- autochunk/autochunker.py +606 -0
- autochunk/chunkers/__init__.py +100 -0
- autochunk/chunkers/agentic.py +184 -0
- autochunk/chunkers/base.py +16 -0
- autochunk/chunkers/contextual_retrieval.py +151 -0
- autochunk/chunkers/fixed_length.py +110 -0
- autochunk/chunkers/html_section.py +225 -0
- autochunk/chunkers/hybrid_semantic_stat.py +199 -0
- autochunk/chunkers/layout_aware.py +192 -0
- autochunk/chunkers/parent_child.py +172 -0
- autochunk/chunkers/proposition.py +175 -0
- autochunk/chunkers/python_ast.py +248 -0
- autochunk/chunkers/recursive_character.py +215 -0
- autochunk/chunkers/semantic_local.py +140 -0
- autochunk/chunkers/sentence_aware.py +102 -0
- autochunk/cli.py +135 -0
- autochunk/config.py +76 -0
- autochunk/embedding/__init__.py +22 -0
- autochunk/embedding/adapter.py +14 -0
- autochunk/embedding/base.py +33 -0
- autochunk/embedding/hashing.py +42 -0
- autochunk/embedding/local.py +154 -0
- autochunk/embedding/ollama.py +66 -0
- autochunk/embedding/openai.py +62 -0
- autochunk/embedding/tokenizer.py +9 -0
- autochunk/enrichment/__init__.py +0 -0
- autochunk/enrichment/contextual.py +29 -0
- autochunk/eval/__init__.py +0 -0
- autochunk/eval/harness.py +177 -0
- autochunk/eval/metrics.py +27 -0
- autochunk/eval/ragas_eval.py +234 -0
- autochunk/eval/synthetic.py +104 -0
- autochunk/quality/__init__.py +31 -0
- autochunk/quality/deduplicator.py +326 -0
- autochunk/quality/overlap_optimizer.py +402 -0
- autochunk/quality/post_processor.py +245 -0
- autochunk/quality/scorer.py +459 -0
- autochunk/retrieval/__init__.py +0 -0
- autochunk/retrieval/in_memory.py +47 -0
- autochunk/retrieval/parent_child.py +4 -0
- autochunk/storage/__init__.py +0 -0
- autochunk/storage/cache.py +34 -0
- autochunk/storage/plan.py +40 -0
- autochunk/utils/__init__.py +0 -0
- autochunk/utils/hashing.py +8 -0
- autochunk/utils/io.py +176 -0
- autochunk/utils/logger.py +64 -0
- autochunk/utils/telemetry.py +44 -0
- autochunk/utils/text.py +199 -0
- autochunks-0.0.8.dist-info/METADATA +133 -0
- autochunks-0.0.8.dist-info/RECORD +61 -0
- autochunks-0.0.8.dist-info/WHEEL +5 -0
- autochunks-0.0.8.dist-info/entry_points.txt +2 -0
- autochunks-0.0.8.dist-info/licenses/LICENSE +15 -0
- autochunks-0.0.8.dist-info/top_level.txt +1 -0
--- /dev/null
+++ autochunk/chunkers/parent_child.py
@@ -0,0 +1,172 @@
+
+from __future__ import annotations
+from typing import List, Dict, Any, Optional
+from .base import BaseChunker, Chunk
+from .recursive_character import RecursiveCharacterChunker
+from ..utils.text import count_tokens
+
+class ParentChildChunker(BaseChunker):
+    """
+    Hierarchical (Small-to-Big) Parent-Child Chunker.
+
+    BEST-OF-BREED FEATURES:
+    1. N-Level Hierarchy: Configurable depth (Document > Section > Paragraph > Sentence).
+    2. Sibling References: Tracks prev_sibling_id and next_sibling_id for traversal.
+    3. Parent Context: Stores parent text in metadata for rich LLM context.
+    4. Child Overlap: Optional overlap between children for context continuity.
+    """
+    name = "parent_child"
+
+    def __init__(self,
+                 chunk_sizes: Optional[List[int]] = None,
+                 overlap: int = 0,
+                 track_siblings: bool = True):
+        """
+        Initialize the chunker.
+
+        Args:
+            chunk_sizes: List of sizes for each hierarchy level.
+                Default [2048, 512, 128] creates 3 levels: large -> medium -> small
+            overlap: Token overlap between sibling chunks at each level
+            track_siblings: If True, add prev/next references to metadata
+        """
+        self.chunk_sizes = chunk_sizes or [2048, 512, 128]
+        self.overlap = overlap
+        self.track_siblings = track_siblings
+
+    def chunk(self,
+              doc_id: str,
+              text: str,
+              parent_size: Optional[int] = None,
+              child_size: Optional[int] = None,
+              overlap: Optional[int] = None,
+              return_all_levels: bool = False,
+              **params) -> List[Chunk]:
+        """
+        Create hierarchical chunks with parent-child relationships.
+
+        Args:
+            doc_id: Document identifier
+            text: Input text
+            parent_size: Override first level size (for backward compatibility)
+            child_size: Override last level size (for backward compatibility)
+            overlap: Override overlap setting
+            return_all_levels: If True, return chunks from all levels, not just leaves
+
+        Returns:
+            List of Chunk objects (leaf nodes by default, or all nodes if return_all_levels=True)
+        """
+        # Handle legacy 2-level params
+        if parent_size and child_size:
+            chunk_sizes = [parent_size, child_size]
+        else:
+            chunk_sizes = self.chunk_sizes
+
+        if overlap is None:
+            overlap = self.overlap
+
+        base_chunker = RecursiveCharacterChunker()
+
+        all_chunks = []
+
+        def _build_hierarchy(input_text: str,
+                             level: int,
+                             parent_info: Dict[str, Any],
+                             node_path: str) -> List[Chunk]:
+            """
+            Recursively build chunk hierarchy.
+
+            Args:
+                input_text: Text to chunk
+                level: Current hierarchy level (0 = root)
+                parent_info: Info about parent chunk
+                node_path: Path identifier for this node
+
+            Returns:
+                List of chunks at this level (and below if return_all_levels)
+            """
+            if level >= len(chunk_sizes):
+                return []
+
+            current_size = chunk_sizes[level]
+            is_leaf = (level == len(chunk_sizes) - 1)
+
+            # Create chunks at this level
+            level_chunks = base_chunker.chunk(
+                doc_id=f"{doc_id}_L{level}",
+                text=input_text,
+                base_token_size=current_size,
+                overlap=overlap
+            )
+
+            result_chunks = []
+
+            for idx, chunk in enumerate(level_chunks):
+                chunk_id = f"{node_path}#L{level}#{idx}"
+
+                # Build metadata with parent info
+                meta = {
+                    "chunk_index": idx,
+                    "level": level,
+                    "is_leaf": is_leaf,
+                    "strategy": "parent_child",
+                    "token_count": count_tokens(chunk.text)
+                }
+
+                # Add parent references
+                if parent_info:
+                    meta["parent_id"] = parent_info.get("id")
+                    meta["parent_text"] = parent_info.get("text", "")[:500]  # Truncate for efficiency
+
+                # Add sibling references
+                if self.track_siblings:
+                    if idx > 0:
+                        meta["prev_sibling_id"] = f"{node_path}#L{level}#{idx - 1}"
+                    if idx < len(level_chunks) - 1:
+                        meta["next_sibling_id"] = f"{node_path}#L{level}#{idx + 1}"
+
+                node = Chunk(
+                    id=chunk_id,
+                    doc_id=doc_id,
+                    text=chunk.text,
+                    meta=meta
+                )
+
+                # Add to results based on return_all_levels setting
+                if return_all_levels or is_leaf:
+                    result_chunks.append(node)
+
+                # Recurse to children if not at leaf level
+                if not is_leaf:
+                    child_parent_info = {
+                        "id": chunk_id,
+                        "text": chunk.text
+                    }
+                    children = _build_hierarchy(
+                        chunk.text,
+                        level + 1,
+                        child_parent_info,
+                        chunk_id
+                    )
+
+                    # Update parent with child references
+                    if children and return_all_levels:
+                        node.meta["child_ids"] = [c.id for c in children]
+
+                    result_chunks.extend(children)
+
+            return result_chunks
+
+        # Build from root
+        root_parent_info = {
+            "id": doc_id,
+            "text": text[:500]  # Document context
+        }
+
+        all_chunks = _build_hierarchy(text, 0, root_parent_info, doc_id)
+
+        # Re-index final chunks sequentially
+        for i, chunk in enumerate(all_chunks):
+            chunk.meta["global_index"] = i
+
+        return all_chunks
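
For orientation, a minimal usage sketch of the small-to-big hierarchy above, assuming only the constructor and chunk() signature shown in this diff (the sample text and printed fields are illustrative):

from autochunk.chunkers.parent_child import ParentChildChunker

# A long document; the repeated sentence just exercises the hierarchy.
text = " ".join(["Solar panels convert sunlight into electricity."] * 400)

# Three levels: 2048-token parents, 512-token sections, 128-token leaves.
chunker = ParentChildChunker(chunk_sizes=[2048, 512, 128], track_siblings=True)

# Leaves only by default; return_all_levels=True also returns parent nodes,
# whose meta carries child_ids for small-to-big expansion at query time.
for c in chunker.chunk(doc_id="doc1", text=text, return_all_levels=True):
    print(c.id, c.meta["level"], c.meta.get("parent_id"), c.meta["is_leaf"])
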
--- /dev/null
+++ autochunk/chunkers/proposition.py
@@ -0,0 +1,175 @@
+
+from __future__ import annotations
+import re
+from typing import List, Callable, Optional
+from .base import BaseChunker, Chunk
+from ..utils.text import count_tokens, split_sentences
+
+class PropositionChunker(BaseChunker):
+    """
+    Proposition-Based Chunker for Atomic Fact Extraction.
+
+    Instead of arbitrary text splits, this chunker extracts atomic propositions
+    (self-contained facts) from the text. Each chunk is a single, verifiable statement.
+
+    BEST-OF-BREED FEATURES:
+    1. Fact-Level Granularity: Each chunk is one atomic fact.
+    2. Self-Contained: Every proposition is understandable without context.
+    3. Decontextualized: Pronouns and references are resolved.
+    4. LLM-Powered: Uses a language model for accurate extraction.
+
+    Reference: "Dense X Retrieval" paper, Greg Kamradt's proposition chunker.
+    """
+    name = "proposition"
+
+    DEFAULT_SYSTEM_PROMPT = """You are an expert at extracting atomic propositions from text.
+
+An atomic proposition is:
+- A single, self-contained fact
+- Expressed in a complete sentence
+- Understandable WITHOUT any additional context
+- Has all pronouns replaced with their referents
+- Contains no dependent references (like "this", "that", "the above")
+
+For example:
+Original: "John went to the store. He bought milk there."
+Propositions:
+1. John went to the store.
+2. John bought milk at the store.
+
+Note how "He" became "John" and "there" became "at the store"."""
+
+    DEFAULT_USER_TEMPLATE = """Extract all atomic propositions from the following text. Each proposition should:
+1. Be a complete, self-contained sentence
+2. Be understandable without additional context
+3. Have all pronouns resolved to their referents
+
+TEXT:
+{text}
+
+Output each proposition on a new line, numbered. Only output the propositions, no other text.
+
+1."""
+
+    def __init__(self,
+                 llm_fn: Optional[Callable[[str, str], str]] = None,
+                 system_prompt: Optional[str] = None,
+                 user_template: Optional[str] = None,
+                 max_tokens_per_call: int = 2000):
+        """
+        Initialize the proposition chunker.
+
+        Args:
+            llm_fn: Function that takes (system_prompt, user_message) and returns the LLM response.
+            system_prompt: Custom system prompt
+            user_template: Custom user message template (must include {text})
+            max_tokens_per_call: Max tokens to process in one LLM call
+        """
+        self.llm_fn = llm_fn
+        self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT
+        self.user_template = user_template or self.DEFAULT_USER_TEMPLATE
+        self.max_tokens_per_call = max_tokens_per_call
+
+    def chunk(self,
+              doc_id: str,
+              text: str,
+              **params) -> List[Chunk]:
+        """
+        Extract atomic propositions from text.
+
+        Args:
+            doc_id: Document identifier
+            text: Input text
+
+        Returns:
+            List of Chunk objects, each containing one proposition
+        """
+        if not text.strip():
+            return []
+
+        # Fallback if no LLM function
+        if self.llm_fn is None:
+            # Use sentence splitting as basic fallback
+            sentences = split_sentences(text)
+            return [
+                Chunk(
+                    id=f"{doc_id}#prop#{i}",
+                    doc_id=doc_id,
+                    text=s.strip(),
+                    meta={
+                        "chunk_index": i,
+                        "strategy": "proposition_fallback",
+                        "is_atomic": False
+                    }
+                ) for i, s in enumerate(sentences) if s.strip()
+            ]
+
+        # Process text in batches if too long
+        propositions = []
+        sentences = split_sentences(text)
+
+        current_batch = []
+        current_tokens = 0
+
+        for sentence in sentences:
+            sent_tokens = count_tokens(sentence)
+
+            if current_tokens + sent_tokens > self.max_tokens_per_call and current_batch:
+                # Process current batch
+                batch_text = " ".join(current_batch)
+                batch_props = self._extract_propositions(batch_text)
+                propositions.extend(batch_props)
+                current_batch = []
+                current_tokens = 0
+
+            current_batch.append(sentence)
+            current_tokens += sent_tokens
+
+        # Process final batch
+        if current_batch:
+            batch_text = " ".join(current_batch)
+            batch_props = self._extract_propositions(batch_text)
+            propositions.extend(batch_props)
+
+        # Create chunk objects
+        chunks = []
+        for i, prop in enumerate(propositions):
+            if prop.strip():
+                chunks.append(Chunk(
+                    id=f"{doc_id}#prop#{i}",
+                    doc_id=doc_id,
+                    text=prop.strip(),
+                    meta={
+                        "chunk_index": i,
+                        "strategy": "proposition",
+                        "is_atomic": True,
+                        "token_count": count_tokens(prop)
+                    }
+                ))
+
+        return chunks
+
+    def _extract_propositions(self, text: str) -> List[str]:
+        """Extract propositions using the LLM."""
+        user_message = self.user_template.format(text=text)
+
+        try:
+            response = self.llm_fn(self.system_prompt, user_message)
+
+            # Parse numbered list
+            propositions = []
+            for line in response.strip().split("\n"):
+                line = line.strip()
+                if not line:
+                    continue
+
+                # Remove numbering (1. 2. etc or 1) 2) etc)
+                cleaned = re.sub(r'^[\d]+[\.\)]\s*', '', line)
+                if cleaned:
+                    propositions.append(cleaned)
+
+            return propositions
+
+        except Exception:
+            # Fallback to sentence splitting
+            return split_sentences(text)
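
The llm_fn contract above is simply (system_prompt, user_message) -> str, so any LLM client can be adapted to it. A minimal sketch with a stubbed callable; fake_llm below is illustrative, not part of the package:

from autochunk.chunkers.proposition import PropositionChunker

def fake_llm(system_prompt: str, user_message: str) -> str:
    # Stand-in for a real completion call; returns the numbered-list
    # format that _extract_propositions parses with re.sub.
    return "1. John went to the store.\n2. John bought milk at the store."

chunker = PropositionChunker(llm_fn=fake_llm)
chunks = chunker.chunk(doc_id="doc1",
                       text="John went to the store. He bought milk there.")
# Each chunk is one proposition with meta["is_atomic"] = True. With
# llm_fn=None the class degrades to plain sentence splitting and tags
# chunks with strategy "proposition_fallback" instead.
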
--- /dev/null
+++ autochunk/chunkers/python_ast.py
@@ -0,0 +1,248 @@
+
+from __future__ import annotations
+from typing import List, Optional
+import ast
+from .base import BaseChunker, Chunk
+from ..utils.text import count_tokens
+
+class PythonASTChunker(BaseChunker):
+    """
+    AST-Based Python Code Chunker.
+
+    Uses Python's Abstract Syntax Tree to split code at natural boundaries
+    (classes, functions, imports) rather than arbitrary line counts.
+
+    BEST-OF-BREED FEATURES:
+    1. Structural Awareness: Splits at class/function boundaries.
+    2. Docstring Preservation: Keeps docstrings with their functions.
+    3. Import Grouping: Groups imports together.
+    4. Nested Handling: Handles nested classes and functions.
+    5. Context Prepending: Optionally prepends module/class context.
+    """
+    name = "python_ast"
+
+    def __init__(self,
+                 include_imports_in_all: bool = True,
+                 split_classes: bool = True,
+                 split_functions: bool = True,
+                 max_tokens: int = 1000,
+                 prepend_context: bool = True):
+        """
+        Initialize the Python AST chunker.
+
+        Args:
+            include_imports_in_all: If True, prepend imports to every chunk.
+            split_classes: If True, split classes into separate chunks.
+            split_functions: If True, split functions into separate chunks.
+            max_tokens: Maximum tokens per chunk (will further split if exceeded).
+            prepend_context: If True, prepend class name to method chunks.
+        """
+        self.include_imports_in_all = include_imports_in_all
+        self.split_classes = split_classes
+        self.split_functions = split_functions
+        self.max_tokens = max_tokens
+        self.prepend_context = prepend_context
+
+    def chunk(self,
+              doc_id: str,
+              text: str,
+              **params) -> List[Chunk]:
+        """
+        Parse Python code and split at structural boundaries.
+
+        Args:
+            doc_id: Document identifier
+            text: Python source code
+
+        Returns:
+            List of Chunk objects
+        """
+        if not text.strip():
+            return []
+
+        try:
+            tree = ast.parse(text)
+        except SyntaxError:
+            # Fallback to line-based splitting for invalid Python
+            from .recursive_character import RecursiveCharacterChunker
+            return RecursiveCharacterChunker().chunk(doc_id, text, base_token_size=self.max_tokens)
+
+        lines = text.split("\n")
+        chunks = []
+
+        # Extract imports
+        imports = []
+        import_lines = set()
+
+        for node in ast.walk(tree):
+            if isinstance(node, (ast.Import, ast.ImportFrom)):
+                if hasattr(node, 'lineno'):
+                    start = node.lineno - 1
+                    end = getattr(node, 'end_lineno', node.lineno)
+                    import_text = "\n".join(lines[start:end])
+                    imports.append(import_text)
+                    for i in range(start, end):
+                        import_lines.add(i)
+
+        import_block = "\n".join(imports)
+
+        # Process top-level definitions
+        for node in ast.iter_child_nodes(tree):
+            if isinstance(node, ast.ClassDef):
+                chunks.extend(self._process_class(doc_id, node, lines, import_block, len(chunks)))
+            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                chunks.extend(self._process_function(doc_id, node, lines, import_block, len(chunks), None))
+
+        # If no structures found, treat as single chunk
+        if not chunks:
+            return [Chunk(
+                id=f"{doc_id}#py#0",
+                doc_id=doc_id,
+                text=text,
+                meta={"chunk_index": 0, "strategy": "python_ast", "type": "module"}
+            )]
+
+        return chunks
+
+    def _process_class(self, doc_id: str, node: ast.ClassDef, lines: List[str],
+                       import_block: str, start_idx: int) -> List[Chunk]:
+        """Process a class definition."""
+        chunks = []
+
+        start = node.lineno - 1
+        end = node.end_lineno
+        class_text = "\n".join(lines[start:end])
+        class_name = node.name
+
+        # Get class docstring
+        docstring = ast.get_docstring(node) or ""
+
+        if not self.split_classes:
+            # Return whole class as one chunk
+            full_text = class_text
+            if self.include_imports_in_all and import_block:
+                full_text = import_block + "\n\n" + class_text
+
+            chunks.append(Chunk(
+                id=f"{doc_id}#py#{start_idx}",
+                doc_id=doc_id,
+                text=full_text,
+                meta={
+                    "chunk_index": start_idx,
+                    "strategy": "python_ast",
+                    "type": "class",
+                    "name": class_name,
+                    "docstring": docstring[:200],
+                    "token_count": count_tokens(full_text)
+                }
+            ))
+            return chunks
+
+        # Process methods within class
+        methods_processed = set()
+
+        for item in node.body:
+            if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                method_chunks = self._process_function(
+                    doc_id, item, lines, import_block,
+                    start_idx + len(chunks), class_name
+                )
+                chunks.extend(method_chunks)
+                methods_processed.add(item.lineno)
+
+        # If no methods were found, return the whole class as one chunk
+        if not methods_processed:
+            full_text = class_text
+            if self.include_imports_in_all and import_block:
+                full_text = import_block + "\n\n" + class_text
+
+            chunks.append(Chunk(
+                id=f"{doc_id}#py#{start_idx}",
+                doc_id=doc_id,
+                text=full_text,
+                meta={
+                    "chunk_index": start_idx,
+                    "strategy": "python_ast",
+                    "type": "class",
+                    "name": class_name,
+                    "token_count": count_tokens(full_text)
+                }
+            ))
+
+        return chunks
+
+    def _process_function(self, doc_id: str, node, lines: List[str],
+                          import_block: str, idx: int, class_name: Optional[str]) -> List[Chunk]:
+        """Process a function definition."""
+        start = node.lineno - 1
+        end = node.end_lineno
+        func_text = "\n".join(lines[start:end])
+        func_name = node.name
+
+        # Get docstring
+        docstring = ast.get_docstring(node) or ""
+
+        # Build context prefix
+        context_prefix = ""
+        if self.prepend_context and class_name:
+            context_prefix = f"# Method of class: {class_name}\n"
+
+        # Build full text
+        full_text = func_text
+        if context_prefix:
+            full_text = context_prefix + full_text
+        if self.include_imports_in_all and import_block:
+            full_text = import_block + "\n\n" + full_text
+
+        # Check if needs further splitting
+        if count_tokens(full_text) > self.max_tokens:
+            # Split large functions by logical blocks
+            from .recursive_character import RecursiveCharacterChunker
+            sub_chunker = RecursiveCharacterChunker()
+            sub_chunks = sub_chunker.chunk(
+                f"{doc_id}_func_{func_name}",
+                func_text,
+                base_token_size=self.max_tokens
+            )
+
+            chunks = []
+            for i, sc in enumerate(sub_chunks):
+                chunk_text = sc.text
+                if self.include_imports_in_all and import_block and i == 0:
+                    chunk_text = import_block + "\n\n" + chunk_text
+                if context_prefix and i == 0:
+                    chunk_text = context_prefix + chunk_text
+
+                chunks.append(Chunk(
+                    id=f"{doc_id}#py#{idx + i}",
+                    doc_id=doc_id,
+                    text=chunk_text,
+                    meta={
+                        "chunk_index": idx + i,
+                        "strategy": "python_ast",
+                        "type": "function_part",
+                        "name": func_name,
+                        "class_name": class_name,
+                        "part": i + 1,
+                        "token_count": count_tokens(chunk_text)
+                    }
+                ))
+            return chunks
+
+        qualified_name = f"{class_name}.{func_name}" if class_name else func_name
+
+        return [Chunk(
+            id=f"{doc_id}#py#{idx}",
+            doc_id=doc_id,
+            text=full_text,
+            meta={
+                "chunk_index": idx,
+                "strategy": "python_ast",
+                "type": "method" if class_name else "function",
+                "name": func_name,
+                "qualified_name": qualified_name,
+                "class_name": class_name,
+                "docstring": docstring[:200] if docstring else "",
+                "token_count": count_tokens(full_text)
+            }
+        )]
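
A small sketch of the AST chunker applied to an inline source string, assuming only the API shown in this diff (the sample module is made up):

from autochunk.chunkers.python_ast import PythonASTChunker

source = '''
import math

class Circle:
    """A circle."""
    def area(self, r):
        return math.pi * r ** 2

def main():
    print(Circle().area(2))
'''

chunker = PythonASTChunker(include_imports_in_all=True, prepend_context=True)
for c in chunker.chunk(doc_id="circle.py", text=source):
    # Method chunks get "# Method of class: Circle" prepended, and the
    # import block ("import math") is copied in front of every chunk.
    print(c.meta["type"], c.meta.get("qualified_name"), "->", c.id)
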