sec2md-0.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sec2md/__init__.py +36 -0
- sec2md/absolute_table_parser.py +622 -0
- sec2md/chunker/__init__.py +0 -0
- sec2md/chunker/markdown_blocks.py +135 -0
- sec2md/chunker/markdown_chunk.py +133 -0
- sec2md/chunker/markdown_chunker.py +270 -0
- sec2md/chunking.py +179 -0
- sec2md/core.py +93 -0
- sec2md/models.py +400 -0
- sec2md/parser.py +1217 -0
- sec2md/section_extractor.py +623 -0
- sec2md/sections.py +84 -0
- sec2md/table_parser.py +386 -0
- sec2md/utils.py +109 -0
- sec2md-0.1.5.dist-info/METADATA +216 -0
- sec2md-0.1.5.dist-info/RECORD +19 -0
- sec2md-0.1.5.dist-info/WHEEL +5 -0
- sec2md-0.1.5.dist-info/licenses/LICENSE +21 -0
- sec2md-0.1.5.dist-info/top_level.txt +1 -0
sec2md/chunker/markdown_blocks.py
@@ -0,0 +1,135 @@
import re
from typing import List
from pydantic import BaseModel, Field, computed_field

try:
    import tiktoken
    TIKTOKEN_AVAILABLE = True
except ImportError:
    TIKTOKEN_AVAILABLE = False


def estimate_tokens(text: str) -> int:
    """
    Calculate token count for text.

    Uses tiktoken with cl100k_base encoding (gpt-3.5-turbo/gpt-4) if available.
    Falls back to a characters/4 heuristic if tiktoken is not installed.
    """
    if TIKTOKEN_AVAILABLE:
        encoding = tiktoken.get_encoding("cl100k_base")
        return len(encoding.encode(text))
    else:
        # Fallback: simple heuristic
        return max(1, len(text) // 4)


def split_sentences(text: str) -> List[str]:
    """Simple regex-based sentence splitter."""
    # Split on .!? followed by whitespace and a capital letter.
    # Note: abbreviations followed by a capitalized word (e.g. "Mr. Smith")
    # may still be split.
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
    return [s.strip() for s in sentences if s.strip()]


class BaseBlock(BaseModel):
    """Base class for markdown blocks."""
    block_type: str = Field(..., description="Type of markdown block")
    content: str = Field(..., description="Block content")
    page: int = Field(..., description="Page number")

    model_config = {"frozen": False}

    @computed_field
    @property
    def tokens(self) -> int:
        return estimate_tokens(self.content)


class Sentence(BaseModel):
    """Sentence within a text block."""
    content: str = Field(..., description="Sentence content")

    model_config = {"frozen": False}

    @computed_field
    @property
    def tokens(self) -> int:
        return estimate_tokens(self.content)


class TextBlock(BaseBlock):
    block_type: str = Field(default='Text', description="Text block type")

    @computed_field
    @property
    def sentences(self) -> List[Sentence]:
        """Returns the text block sentences"""
        return [Sentence(content=content) for content in split_sentences(self.content)]

    @classmethod
    def from_sentences(cls, sentences: List[Sentence], page: int):
        content = " ".join([sentence.content for sentence in sentences])
        return cls(content=content, page=page, block_type='Text')


class AudioParagraphBlock(BaseBlock):
    block_type: str = Field(default="Text", description="Audio paragraph block type")
    paragraph_id: int = Field(..., description="Paragraph ID")
    audio_start: float = Field(..., description="Audio start time")
    audio_end: float = Field(..., description="Audio end time")

    @computed_field
    @property
    def sentences(self) -> List[Sentence]:
        """Returns the text block sentences"""
        return [Sentence(content=content) for content in split_sentences(self.content)]

    def format(self) -> dict:
        """Formats the audio paragraph"""
        return {"id": self.paragraph_id, "content": self.content, "start": self.audio_start, "end": self.audio_end}


class TableBlock(BaseModel):
    block_type: str = Field(default='Table', description="Table block type")
    content: str = Field(..., description="Table content")
    page: int = Field(..., description="Page number")

    model_config = {"frozen": False}

    def __init__(self, **data):
        if 'content' in data:
            data['content'] = self._to_minified_markdown_static(data['content'])
        super().__init__(**data)

    @computed_field
    @property
    def tokens(self) -> int:
        return estimate_tokens(self.content)

    @staticmethod
    def _to_minified_markdown_static(content: str) -> str:
        """Returns the table in a minified Markdown format"""
        lines = content.split('\n')
        cleaned_lines = []

        for i, line in enumerate(lines):
            if not line.strip():
                continue

            parts = line.split('|')
            cleaned_parts = [re.sub(r'\s+', ' ', part.strip()) for part in parts]
            cleaned_line = '|'.join(cleaned_parts)

            if i == 1:
                num_cols = len(cleaned_parts) - 1
                separator = '|' + '|'.join(['---'] * num_cols) + '|'
                cleaned_lines.append(separator)
            else:
                cleaned_lines.append(cleaned_line)

        return '\n'.join(cleaned_lines)


class HeaderBlock(BaseBlock):
    block_type: str = Field(default='Header', description="Header block type")
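A minimal usage sketch of the block primitives above. The sample strings are invented, and tiktoken is optional (the token count falls back to a characters/4 heuristic without it):

from sec2md.chunker.markdown_blocks import TextBlock, TableBlock, estimate_tokens

# Works with or without tiktoken installed
assert estimate_tokens("abcdefgh") >= 1

# A text block splits itself into Sentence objects on demand
text = TextBlock(content="Revenue grew 4%. Margins were flat.", page=1)
print([s.content for s in text.sentences])  # ['Revenue grew 4%.', 'Margins were flat.']
print(text.tokens)

# Table content is minified on construction: whitespace is collapsed
# and the second line (index 1) is rebuilt as the separator row
table = TableBlock(content="| Item   | 2024 |\n|---|---|\n| Sales  | 10   |", page=1)
print(table.content)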
sec2md/chunker/markdown_chunk.py
@@ -0,0 +1,133 @@
from typing import List, Optional, Tuple, TYPE_CHECKING
from pydantic import BaseModel, Field, computed_field

from sec2md.chunker.markdown_blocks import BaseBlock

if TYPE_CHECKING:
    from sec2md.models import Element
else:
    Element = 'Element'  # Forward reference for Pydantic


class MarkdownChunk(BaseModel):
    """Represents a chunk of markdown content that can be embedded"""

    blocks: List[BaseBlock] = Field(..., description="List of markdown blocks in this chunk")
    header: Optional[str] = Field(None, description="Optional header for embedding context")
    elements: List['Element'] = Field(default_factory=list, description="Element objects for citation")
    vector: Optional[List[float]] = Field(None, description="Vector embedding for this chunk")

    model_config = {"frozen": False, "arbitrary_types_allowed": True}

    @computed_field
    @property
    def page(self) -> int:
        """First page (for backward compatibility)."""
        return self.blocks[0].page if self.blocks else 1

    def set_vector(self, vector: List[float]):
        """Set the vector embedding for this chunk"""
        self.vector = vector

    @computed_field
    @property
    def start_page(self) -> int:
        """First page this chunk appears on (from elements or blocks)."""
        # Prefer elements since they have actual page info from the document
        if self.elements:
            return min(e.page_start for e in self.elements)
        elif self.blocks:
            return min(block.page for block in self.blocks)
        return self.page

    @computed_field
    @property
    def end_page(self) -> int:
        """Last page this chunk appears on (from elements or blocks)."""
        # Prefer elements since they have actual page info from the document
        if self.elements:
            return max(e.page_end for e in self.elements)
        elif self.blocks:
            return max(block.page for block in self.blocks)
        return self.page

    @computed_field
    @property
    def page_range(self) -> Tuple[int, int]:
        """(start_page, end_page) tuple."""
        return (self.start_page, self.end_page)

    @computed_field
    @property
    def content(self) -> str:
        """Get the text content of this chunk"""
        return "\n".join([block.content for block in self.blocks])

    @computed_field
    @property
    def data(self) -> List[dict]:
        """Returns a list of block data grouped by page with ONLY the chunk's content"""
        page_blocks = {}

        for block in self.blocks:
            if block.page not in page_blocks:
                page_blocks[block.page] = []
            page_blocks[block.page].append(block)

        page_content_data = []
        for page, blocks in page_blocks.items():
            # Only include the content from blocks in THIS chunk, not full page content
            page_content = "\n".join(block.content for block in blocks)
            if not page_content.strip():
                continue

            page_content_data.append({
                "page": page,
                "content": page_content
            })

        return sorted(page_content_data, key=lambda x: x["page"])

    @computed_field
    @property
    def pages(self) -> List[dict]:
        """Returns a list of pages with ONLY this chunk's content (not full page content)"""
        return self.data

    @computed_field
    @property
    def embedding_text(self) -> str:
        """Get the text to use for embedding, with the optional header prepended"""
        if self.header:
            return f"{self.header}\n\n...\n\n{self.content}"
        return self.content

    @computed_field
    @property
    def has_table(self) -> bool:
        """Returns True if this chunk contains one or more table blocks"""
        return any(block.block_type == 'Table' for block in self.blocks)

    @computed_field
    @property
    def num_tokens(self) -> int:
        """Returns the total number of tokens in this chunk"""
        return sum(block.tokens for block in self.blocks)

    @computed_field
    @property
    def element_ids(self) -> List[str]:
        """List of element IDs for citations."""
        return [e.id for e in self.elements] if self.elements else []

    def to_dict(self) -> dict:
        """Alias for model_dump() - kept for backward compat during alpha."""
        return self.model_dump()

    def __repr__(self):
        pages_str = f"{self.start_page}-{self.end_page}" if self.start_page != self.end_page else str(self.start_page)
        return f"MarkdownChunk(pages={pages_str}, blocks={len(self.blocks)}, tokens={self.num_tokens})"

    def _repr_markdown_(self):
        """Called by IPython to display the chunk as Markdown"""
        return self.content
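A short usage sketch for MarkdownChunk. The sample strings and header are made up; importing the chunker module first matters because it runs the MarkdownChunk.model_rebuild() that resolves the Element forward reference:

from sec2md.chunker.markdown_blocks import TextBlock
import sec2md.chunker.markdown_chunker  # noqa: F401 - triggers MarkdownChunk.model_rebuild()
from sec2md.chunker.markdown_chunk import MarkdownChunk

blocks = [
    TextBlock(content="Item 7. Management's Discussion and Analysis.", page=4),
    TextBlock(content="Liquidity remained strong through the year.", page=5),
]
chunk = MarkdownChunk(blocks=blocks, header="Example Co. 10-K")

print(chunk.page_range)      # (4, 5), derived from blocks since no elements are attached
print(chunk.has_table)       # False
print(chunk.embedding_text)  # header, a "..." separator, then the chunk content
print(chunk.data)            # [{'page': 4, 'content': ...}, {'page': 5, 'content': ...}]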
sec2md/chunker/markdown_chunker.py
@@ -0,0 +1,270 @@
import logging
from typing import Union, Tuple, List, Dict, Any

from sec2md.chunker.markdown_chunk import MarkdownChunk
from sec2md.chunker.markdown_blocks import BaseBlock, TextBlock, TableBlock, HeaderBlock

# Rebuild MarkdownChunk after Element is defined
from sec2md.models import Element
MarkdownChunk.model_rebuild()

logger = logging.getLogger(__name__)


class MarkdownChunker:
    """Splits markdown content into chunks"""

    def __init__(self, chunk_size: int = 512, chunk_overlap: int = 128):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split(self, pages: List[Any], header: str = None) -> List[MarkdownChunk]:
        """Split the pages into chunks, with an optional header for embedding context.

        Args:
            pages: List of Page objects
            header: Optional header to prepend to each chunk's embedding_text

        Returns:
            List of MarkdownChunk objects
        """
        # Build element map: page -> List[Element]
        page_elements = {}
        for page in pages:
            if hasattr(page, 'elements') and page.elements:
                page_elements[page.number] = page.elements

        blocks = self._split_into_blocks(pages=pages)
        return self._chunk_blocks(blocks=blocks, header=header, page_elements=page_elements)

    def chunk_text(self, text: str) -> List[str]:
        """Chunk a single text string into multiple chunks"""
        from sec2md.models import Page
        pages = [Page(number=0, content=text)]
        chunks = self.split(pages=pages)
        return [chunk.content for chunk in chunks]

    @staticmethod
    def _split_into_blocks(pages: List[Any]):
        """Splits the pages into blocks.

        Args:
            pages: List of Page objects

        Returns:
            List of BaseBlock objects
        """
        from sec2md.models import Page

        blocks = []
        table_content = ""
        last_page = None

        for page in pages:
            last_page = page

            for line in page.content.split('\n'):
                if table_content and not MarkdownChunker._is_table_line(line):
                    blocks.append(TableBlock(content=table_content, page=page.number))
                    table_content = ""

                if line.startswith("#"):
                    blocks.append(HeaderBlock(content=line, page=page.number))

                elif MarkdownChunker._is_table_line(line):
                    table_content += f"{line}\n"

                else:
                    blocks.append(TextBlock(content=line, page=page.number))

        if table_content and last_page:
            blocks.append(TableBlock(content=table_content, page=last_page.number))

        return blocks

    @staticmethod
    def _is_table_line(line: str) -> bool:
        import re
        if '|' not in line:
            return False
        stripped = line.strip()
        if not stripped:
            return False
        # Alignment separator rows (e.g. |---|:---:|) are table lines...
        align_pattern = re.compile(r'^\s*:?-+:?\s*$')
        cells = [c.strip() for c in stripped.strip('|').split('|')]
        if all(align_pattern.match(c) for c in cells):
            return True
        # ...and so is any other non-empty line containing a pipe.
        return True

    def _chunk_blocks(self, blocks: List[BaseBlock], header: str = None, page_elements: dict = None) -> List[MarkdownChunk]:
        """Converts the blocks to chunks"""
        page_elements = page_elements or {}
        chunks = []
        chunk_blocks = []
        num_tokens = 0

        for i, block in enumerate(blocks):
            next_block = blocks[i + 1] if i + 1 < len(blocks) else None

            if block.block_type == 'Text':
                chunk_blocks, num_tokens, chunks = self._process_text_block(
                    block, chunk_blocks, num_tokens, chunks, header, page_elements
                )

            elif block.block_type == 'Table':
                chunk_blocks, num_tokens, chunks = self._process_table_block(
                    block, chunk_blocks, num_tokens, chunks, blocks, i, header, page_elements
                )

            else:
                chunk_blocks, num_tokens, chunks = self._process_header_table_block(
                    block, chunk_blocks, num_tokens, chunks, next_block, header, page_elements
                )

        if chunk_blocks:
            self._finalize_chunk(chunks, chunk_blocks, header, page_elements)

        return chunks

    def _process_text_block(self, block: TextBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
                            chunks: List[MarkdownChunk], header: str = None, page_elements: dict = None):
        """Process a text block by breaking it into sentences if needed"""
        sentences = []
        sentences_tokens = 0

        for sentence in block.sentences:
            if num_tokens + sentences_tokens + sentence.tokens > self.chunk_size:
                if sentences:
                    new_block = TextBlock.from_sentences(sentences=sentences, page=block.page)
                    chunk_blocks.append(new_block)
                    num_tokens += sentences_tokens

                chunks, chunk_blocks, num_tokens = self._create_chunk(chunks=chunks, blocks=chunk_blocks, header=header, page_elements=page_elements)

                sentences = [sentence]
                sentences_tokens = sentence.tokens

            else:
                sentences.append(sentence)
                sentences_tokens += sentence.tokens

        if sentences:
            new_block = TextBlock.from_sentences(sentences=sentences, page=block.page)
            chunk_blocks.append(new_block)
            num_tokens += sentences_tokens

        return chunk_blocks, num_tokens, chunks

    def _process_table_block(self, block: BaseBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
                             chunks: List[MarkdownChunk], all_blocks: List[BaseBlock], block_idx: int, header: str = None, page_elements: dict = None):
        """Process a table block, backtracking for a short heading/context that precedes it"""
        context = []
        context_tokens = 0

        # Backtrack for a header only if 1-2 short blocks precede the table
        count = 0
        for j in range(block_idx - 1, -1, -1):
            prev = all_blocks[j]
            if prev.page != block.page:
                break
            if prev.block_type == 'Header':
                if context_tokens + prev.tokens <= 128:
                    context.insert(0, prev)
                    context_tokens += prev.tokens
                break
            elif prev.block_type == 'Text' and prev.content.strip():
                count += 1
                if count > 2:
                    break
                if context_tokens + prev.tokens <= 128:
                    context.insert(0, prev)
                    context_tokens += prev.tokens
                else:
                    break

        if num_tokens + context_tokens + block.tokens > self.chunk_size:
            if chunk_blocks:
                chunks, chunk_blocks, num_tokens = self._create_chunk(chunks=chunks, blocks=chunk_blocks, header=header, page_elements=page_elements)

                # If we're backtracking context and the last chunk is ONLY that context, remove it
                if context and chunks and len(chunks[-1].blocks) == len(context):
                    if all(chunks[-1].blocks[i] == context[i] for i in range(len(context))):
                        chunks.pop()

            chunk_blocks = context + [block]
            num_tokens = context_tokens + block.tokens
        else:
            chunk_blocks.extend(context + [block])
            num_tokens += context_tokens + block.tokens

        return chunk_blocks, num_tokens, chunks

    def _process_header_table_block(self, block: BaseBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
                                    chunks: List[MarkdownChunk], next_block: BaseBlock, header: str = None, page_elements: dict = None):
        """Process a header block"""
        if not chunk_blocks:
            chunk_blocks.append(block)
            num_tokens += block.tokens
            return chunk_blocks, num_tokens, chunks

        # Don't split if the current content is small and the next block is a table
        if next_block and next_block.block_type == 'Table' and num_tokens < self.chunk_overlap:
            chunk_blocks.append(block)
            num_tokens += block.tokens
            return chunk_blocks, num_tokens, chunks

        if num_tokens + block.tokens > self.chunk_size:
            chunks, chunk_blocks, num_tokens = self._create_chunk(chunks=chunks, blocks=chunk_blocks, header=header, page_elements=page_elements)
            chunk_blocks.append(block)
            num_tokens += block.tokens
        else:
            chunk_blocks.append(block)
            num_tokens += block.tokens

        return chunk_blocks, num_tokens, chunks

    def _finalize_chunk(self, chunks: List[MarkdownChunk], blocks: List[BaseBlock], header: str, page_elements: dict):
        """Create a chunk with elements from the pages it spans"""
        chunk_pages = set(block.page for block in blocks)
        elements = []
        for page_num in sorted(chunk_pages):
            if page_num in page_elements:
                elements.extend(page_elements[page_num])
        chunks.append(MarkdownChunk(blocks=blocks, header=header, elements=elements))

    def _create_chunk(self, chunks: List[MarkdownChunk], blocks: List[BaseBlock], header: str = None, page_elements: dict = None) -> Tuple[List[MarkdownChunk], List[BaseBlock], int]:
        """Creates a chunk and returns the overlap blocks that seed the next one"""
        page_elements = page_elements or {}
        self._finalize_chunk(chunks, blocks, header, page_elements)

        if not self.chunk_overlap:
            return chunks, [], 0

        overlap_tokens = 0
        overlap_blocks = []

        for block in reversed(blocks):
            if block.block_type == "Text":
                sentences = []

                for sentence in reversed(block.sentences):
                    if overlap_tokens + sentence.tokens > self.chunk_overlap:
                        text_block = TextBlock.from_sentences(sentences=sentences, page=block.page)
                        overlap_blocks.insert(0, text_block)
                        return chunks, overlap_blocks, overlap_tokens
                    else:
                        sentences.insert(0, sentence)
                        overlap_tokens += sentence.tokens

                # All sentences of this block fit within the overlap budget:
                # keep them and continue with the preceding block.
                if sentences:
                    overlap_blocks.insert(0, TextBlock.from_sentences(sentences=sentences, page=block.page))

            else:
                if overlap_tokens + block.tokens > self.chunk_overlap:
                    return chunks, overlap_blocks, overlap_tokens

                else:
                    overlap_blocks.insert(0, block)
                    overlap_tokens += block.tokens

        # Every block fit within the overlap budget; carrying the overlap over
        # would duplicate the whole chunk, so start the next chunk fresh.
        return chunks, [], 0
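An end-to-end sketch of the chunker on a small invented markdown sample. chunk_text wraps the string in a single Page(number=0, content=...) internally, as shown above, so no Page import is needed here:

from sec2md.chunker.markdown_chunker import MarkdownChunker

markdown = (
    "# Item 8. Financial Statements\n"
    "Revenue grew year over year. Operating margin expanded.\n"
    "| Metric | 2024 | 2023 |\n"
    "|---|---|---|\n"
    "| Revenue | $10B | $9B |\n"
)

# chunk_size and chunk_overlap mirror the constructor defaults
chunker = MarkdownChunker(chunk_size=512, chunk_overlap=128)

for i, text in enumerate(chunker.chunk_text(markdown)):
    print(f"--- chunk {i} ---")
    print(text)

With input this small everything lands in one chunk; the header backtracking and sentence-level overlap only come into play once a page exceeds chunk_size tokens.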