sec2md 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sec2md might be problematic. Click here for more details.

@@ -0,0 +1,135 @@
1
+ import re
2
+ from typing import List
3
+ from pydantic import BaseModel, Field, computed_field
4
+
5
+ try:
6
+ import tiktoken
7
+ TIKTOKEN_AVAILABLE = True
8
+ except ImportError:
9
+ TIKTOKEN_AVAILABLE = False
10
+
11
+
12
def estimate_tokens(text: str) -> int:
    """
    Calculate token count for text.

    Uses tiktoken with the cl100k_base encoding (gpt-3.5-turbo/gpt-4) when the
    module-level ``TIKTOKEN_AVAILABLE`` flag is set, and falls back to a
    characters/4 heuristic otherwise.

    Args:
        text: The text to measure.

    Returns:
        The token count. The heuristic fallback never returns less than 1,
        even for empty text.
    """
    if not TIKTOKEN_AVAILABLE:
        # Fallback: simple heuristic
        return max(1, len(text) // 4)

    encoding = tiktoken.get_encoding("cl100k_base")
    # By default encode() raises ValueError when the text contains a
    # special-token literal such as "<|endoftext|>". Counting must never fail
    # on document text, so encode such literals as ordinary text instead.
    return len(encoding.encode(text, disallowed_special=()))
25
+
26
+
27
# A fragment ending in one of these abbreviations is not a sentence end.
_ABBREVIATION_TAIL = re.compile(
    r'\b(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|St|Inc|Ltd|Corp|Co|vs)\.$'
)


def split_sentences(text: str) -> List[str]:
    """Simple regex-based sentence splitter.

    Splits on ``.``, ``!`` or ``?`` followed by whitespace and a capital
    letter. Fragments that end in a common abbreviation (Mr., Dr., Inc., ...)
    are joined with the fragment that follows, so "Mr. Smith left." stays one
    sentence instead of being cut after "Mr.".

    Args:
        text: The text to split.

    Returns:
        The stripped, non-empty sentences in their original order.
    """
    fragments = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)

    sentences: List[str] = []
    for fragment in fragments:
        fragment = fragment.strip()
        if not fragment:
            continue
        if sentences and _ABBREVIATION_TAIL.search(sentences[-1]):
            # The previous split happened right after an abbreviation: undo it.
            sentences[-1] = f"{sentences[-1]} {fragment}"
        else:
            sentences.append(fragment)
    return sentences
33
+
34
+
35
class BaseBlock(BaseModel):
    """Shared shape of a markdown block: a type tag, its text and a page number."""

    model_config = dict(frozen=False)

    block_type: str = Field(default=..., description="Type of markdown block")
    content: str = Field(default=..., description="Block content")
    page: int = Field(default=..., description="Page number")

    @computed_field
    @property
    def tokens(self) -> int:
        """Token count of ``content`` as reported by ``estimate_tokens``."""
        token_count = estimate_tokens(self.content)
        return token_count
47
+
48
+
49
class Sentence(BaseModel):
    """A single sentence taken from a text block."""

    model_config = dict(frozen=False)

    content: str = Field(default=..., description="Sentence content")

    @computed_field
    @property
    def tokens(self) -> int:
        """Token count of ``content`` as reported by ``estimate_tokens``."""
        token_count = estimate_tokens(self.content)
        return token_count
59
+
60
+
61
class TextBlock(BaseBlock):
    """Block of plain text that can be broken into sentences."""

    block_type: str = Field(default="Text", description="Text block type")

    @computed_field
    @property
    def sentences(self) -> List[Sentence]:
        """Return the block content split into ``Sentence`` objects."""
        pieces = split_sentences(self.content)
        return [Sentence(content=piece) for piece in pieces]

    @classmethod
    def from_sentences(cls, sentences: List[Sentence], page: int):
        """Build a text block whose content is the sentences joined by single spaces."""
        joined = " ".join(item.content for item in sentences)
        return cls(content=joined, page=page, block_type="Text")
74
+
75
+
76
class AudioParagraphBlock(BaseBlock):
    """Text-typed block that also carries a paragraph id and audio start/end times."""

    block_type: str = Field(default="Text", description="Audio paragraph block type")
    paragraph_id: int = Field(default=..., description="Paragraph ID")
    audio_start: float = Field(default=..., description="Audio start time")
    audio_end: float = Field(default=..., description="Audio end time")

    @computed_field
    @property
    def sentences(self) -> List[Sentence]:
        """Return the block content split into ``Sentence`` objects."""
        pieces = split_sentences(self.content)
        return [Sentence(content=piece) for piece in pieces]

    def format(self) -> dict:
        """Return the paragraph as a dict with ``id``, ``content``, ``start`` and ``end`` keys."""
        return dict(
            id=self.paragraph_id,
            content=self.content,
            start=self.audio_start,
            end=self.audio_end,
        )
91
+
92
+
93
class TableBlock(BaseModel):
    """Markdown table block whose content is minified when constructed via ``__init__``."""

    block_type: str = Field(default='Table', description="Table block type")
    content: str = Field(..., description="Table content")
    page: int = Field(..., description="Page number")

    model_config = {"frozen": False}

    def __init__(self, **data):
        # Minify before validation so the stored content (and therefore the
        # token count) reflects the compact form.
        if 'content' in data:
            data['content'] = self._to_minified_markdown_static(data['content'])
        super().__init__(**data)

    @computed_field
    @property
    def tokens(self) -> int:
        """Token count of the (minified) table content."""
        return estimate_tokens(self.content)

    @staticmethod
    def _to_minified_markdown_static(content: str) -> str:
        """Returns the table in a Minified Markdown format.

        Blank lines are dropped, every cell is stripped and has its internal
        whitespace runs collapsed to a single space. If the second non-blank
        row is a delimiter row (cells made only of dashes with optional
        alignment colons) it is rewritten as ``|---|...|`` with one ``---``
        per actual column. A second row that holds data is kept as data
        rather than being overwritten by a separator.

        Args:
            content: Raw markdown table text, one row per line.

        Returns:
            The minified table text with rows joined by newlines.
        """
        rows = [line for line in content.split('\n') if line.strip()]
        cleaned_lines = []

        for index, line in enumerate(rows):
            if index == 1:
                # Count real columns: the outer pipes are borders, not cells.
                inner = line.strip()
                if inner.startswith('|'):
                    inner = inner[1:]
                if inner.endswith('|'):
                    inner = inner[:-1]
                cells = [cell.strip() for cell in inner.split('|')]
                if all(re.fullmatch(r':?-+:?', cell) for cell in cells):
                    cleaned_lines.append('|' + '|'.join(['---'] * len(cells)) + '|')
                    continue

            parts = [re.sub(r'\s+', ' ', part.strip()) for part in line.split('|')]
            cleaned_lines.append('|'.join(parts))

        return '\n'.join(cleaned_lines)
132
+
133
+
134
class HeaderBlock(BaseBlock):
    """Markdown block whose ``block_type`` defaults to ``Header``."""

    block_type: str = Field(default="Header", description="Header block type")
@@ -0,0 +1,133 @@
1
+ from typing import List, Optional, Tuple, TYPE_CHECKING
2
+ from pydantic import BaseModel, Field, computed_field
3
+
4
+ from sec2md.chunker.markdown_blocks import BaseBlock
5
+
6
+ if TYPE_CHECKING:
7
+ from sec2md.models import Element
8
+ else:
9
+ Element = 'Element' # Forward reference for Pydantic
10
+
11
+
12
class MarkdownChunk(BaseModel):
    """Represents a chunk of markdown content that can be embedded"""

    blocks: List[BaseBlock] = Field(default=..., description="List of markdown blocks in this chunk")
    header: Optional[str] = Field(default=None, description="Optional header for embedding context")
    elements: List['Element'] = Field(default_factory=list, description="Element objects for citation")
    vector: Optional[List[float]] = Field(default=None, description="Vector embedding for this chunk")

    model_config = dict(frozen=False, arbitrary_types_allowed=True)

    @computed_field
    @property
    def page(self) -> int:
        """Page of the first block, or 1 when the chunk has no blocks."""
        if not self.blocks:
            return 1
        return self.blocks[0].page

    def set_vector(self, vector: List[float]):
        """Store the vector embedding on this chunk."""
        self.vector = vector

    @computed_field
    @property
    def start_page(self) -> int:
        """Lowest page of the chunk; elements take precedence over blocks."""
        if self.elements:
            return min(element.page_start for element in self.elements)
        if self.blocks:
            return min(member.page for member in self.blocks)
        return self.page

    @computed_field
    @property
    def end_page(self) -> int:
        """Highest page of the chunk; elements take precedence over blocks."""
        if self.elements:
            return max(element.page_end for element in self.elements)
        if self.blocks:
            return max(member.page for member in self.blocks)
        return self.page

    @computed_field
    @property
    def page_range(self) -> Tuple[int, int]:
        """The ``(start_page, end_page)`` pair."""
        first, last = self.start_page, self.end_page
        return (first, last)

    @computed_field
    @property
    def content(self) -> str:
        """Contents of all blocks joined with newlines."""
        texts = [member.content for member in self.blocks]
        return "\n".join(texts)

    @computed_field
    @property
    def data(self) -> List[dict]:
        """Per-page ``{"page", "content"}`` dicts built only from this chunk's blocks.

        Pages whose joined content is blank are left out; the result is sorted
        by page.
        """
        grouped = {}
        for member in self.blocks:
            grouped.setdefault(member.page, []).append(member)

        entries = []
        for page_number, members in grouped.items():
            text = "\n".join(member.content for member in members)
            if text.strip():
                entries.append({"page": page_number, "content": text})

        entries.sort(key=lambda entry: entry["page"])
        return entries

    @computed_field
    @property
    def pages(self) -> List[dict]:
        """Same value as ``data``."""
        return self.data

    @computed_field
    @property
    def embedding_text(self) -> str:
        """Chunk content, preceded by the header and a ``...`` separator when a header is set."""
        if not self.header:
            return self.content
        return f"{self.header}\n\n...\n\n{self.content}"

    @computed_field
    @property
    def has_table(self) -> bool:
        """True when at least one block has ``block_type`` equal to ``Table``."""
        for member in self.blocks:
            if member.block_type == 'Table':
                return True
        return False

    @computed_field
    @property
    def num_tokens(self) -> int:
        """Sum of the ``tokens`` of every block."""
        total = 0
        for member in self.blocks:
            total += member.tokens
        return total

    @computed_field
    @property
    def element_ids(self) -> List[str]:
        """Ids of the attached elements, or an empty list when there are none."""
        if not self.elements:
            return []
        return [element.id for element in self.elements]

    def to_dict(self) -> dict:
        """Alias for model_dump() - kept for backward compat during alpha."""
        return self.model_dump()

    def __repr__(self):
        first, last = self.start_page, self.end_page
        if first != last:
            pages_str = f"{first}-{last}"
        else:
            pages_str = str(first)
        return f"MarkdownChunk(pages={pages_str}, blocks={len(self.blocks)}, tokens={self.num_tokens})"

    def _repr_markdown_(self):
        """This method is called by IPython to display as Markdown"""
        return self.content
@@ -0,0 +1,270 @@
1
+ import logging
2
+ from typing import Union, Tuple, List, Dict, Any
3
+
4
+ from sec2md.chunker.markdown_chunk import MarkdownChunk
5
+ from sec2md.chunker.markdown_blocks import BaseBlock, TextBlock, TableBlock, HeaderBlock
6
+
7
+ # Rebuild MarkdownChunk after Element is defined
8
+ from sec2md.models import Element
9
+ MarkdownChunk.model_rebuild()
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class MarkdownChunker:
    """Splits markdown content into chunks"""

    # Token budget for the header/text blocks pulled in front of a table.
    _TABLE_CONTEXT_TOKENS = 128

    def __init__(self, chunk_size: int = 512, chunk_overlap: int = 128):
        """Store the chunk size and overlap, both measured in tokens."""
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split(self, pages: List[Any], header: str = None) -> List["MarkdownChunk"]:
        """Split the pages into chunks with optional header for embedding context.

        Args:
            pages: List of Page objects
            header: Optional header to prepend to each chunk's embedding_text

        Returns:
            List of MarkdownChunk objects
        """
        # Build element map: page number -> elements of that page
        page_elements = {}
        for page in pages:
            if getattr(page, 'elements', None):
                page_elements[page.number] = page.elements

        blocks = self._split_into_blocks(pages=pages)
        return self._chunk_blocks(blocks=blocks, header=header, page_elements=page_elements)

    def chunk_text(self, text: str) -> List[str]:
        """Chunk a single text string and return the content of each chunk."""
        from sec2md.models import Page
        pages = [Page(number=0, content=text)]
        return [chunk.content for chunk in self.split(pages=pages)]

    @staticmethod
    def _split_into_blocks(pages: List[Any]):
        """Splits the pages into blocks.

        Lines starting with ``#`` become header blocks, runs of consecutive
        table lines become one table block, and every other line becomes a
        text block. A table is attributed to the page on which it started,
        even when the line that ends it is on a later page.

        Args:
            pages: List of Page objects

        Returns:
            List of BaseBlock objects
        """
        blocks = []
        table_lines = []
        table_page = None  # page number on which the pending table started

        for page in pages:
            for line in page.content.split('\n'):
                is_table = MarkdownChunker._is_table_line(line)

                if table_lines and not is_table:
                    blocks.append(TableBlock(content="".join(table_lines), page=table_page))
                    table_lines = []

                if line.startswith("#"):
                    blocks.append(HeaderBlock(content=line, page=page.number))
                elif is_table:
                    if not table_lines:
                        table_page = page.number
                    table_lines.append(f"{line}\n")
                else:
                    blocks.append(TextBlock(content=line, page=page.number))

        if table_lines:
            blocks.append(TableBlock(content="".join(table_lines), page=table_page))

        return blocks

    @staticmethod
    def _is_table_line(line: str) -> bool:
        """Return True when the line contains a pipe character.

        A line containing ``|`` can never be blank, so no further checks are
        needed; alignment rows are table lines like any other.
        """
        return '|' in line

    def _chunk_blocks(self, blocks: List["BaseBlock"], header: str = None,
                      page_elements: dict = None) -> List["MarkdownChunk"]:
        """Converts the blocks to chunks, dispatching on each block's type."""
        page_elements = page_elements or {}
        chunks = []
        chunk_blocks = []
        num_tokens = 0

        for i, block in enumerate(blocks):
            next_block = blocks[i + 1] if i + 1 < len(blocks) else None

            if block.block_type == 'Text':
                chunk_blocks, num_tokens, chunks = self._process_text_block(
                    block, chunk_blocks, num_tokens, chunks, header, page_elements
                )
            elif block.block_type == 'Table':
                chunk_blocks, num_tokens, chunks = self._process_table_block(
                    block, chunk_blocks, num_tokens, chunks, blocks, i, header, page_elements
                )
            else:
                chunk_blocks, num_tokens, chunks = self._process_header_table_block(
                    block, chunk_blocks, num_tokens, chunks, next_block, header, page_elements
                )

        if chunk_blocks:
            self._finalize_chunk(chunks, chunk_blocks, header, page_elements)

        return chunks

    def _process_text_block(self, block: "TextBlock", chunk_blocks: List["BaseBlock"], num_tokens: int,
                            chunks: List["MarkdownChunk"], header: str = None, page_elements: dict = None):
        """Process a text block by breaking it into sentences if needed.

        Sentences are buffered until adding the next one would exceed
        ``chunk_size``; the buffer is then flushed into the current chunk and
        the chunk is closed.

        Returns:
            ``(chunk_blocks, num_tokens, chunks)`` after the block was added.
        """
        sentences = []
        sentences_tokens = 0

        for sentence in block.sentences:
            if num_tokens + sentences_tokens + sentence.tokens > self.chunk_size:
                if sentences:
                    chunk_blocks.append(TextBlock.from_sentences(sentences=sentences, page=block.page))
                    num_tokens += sentences_tokens

                if chunk_blocks:
                    chunks, chunk_blocks, num_tokens = self._create_chunk(
                        chunks=chunks, blocks=chunk_blocks, header=header, page_elements=page_elements
                    )
                else:
                    # Nothing buffered (a single sentence larger than
                    # chunk_size): closing a chunk here would emit an empty one.
                    num_tokens = 0

                sentences = [sentence]
                sentences_tokens = sentence.tokens
            else:
                sentences.append(sentence)
                sentences_tokens += sentence.tokens

        if sentences:
            chunk_blocks.append(TextBlock.from_sentences(sentences=sentences, page=block.page))
            num_tokens += sentences_tokens

        return chunk_blocks, num_tokens, chunks

    @staticmethod
    def _block_key(block):
        """Identity of a block for de-duplication: page plus whitespace-normalised content."""
        return block.page, " ".join(block.content.split())

    def _process_table_block(self, block: "BaseBlock", chunk_blocks: List["BaseBlock"], num_tokens: int,
                             chunks: List["MarkdownChunk"], all_blocks: List["BaseBlock"], block_idx: int,
                             header: str = None, page_elements: dict = None):
        """Process a table block with optional header backtrack.

        Walks backwards from the table, on the same page only, collecting the
        nearest header and up to two short non-blank text blocks as context,
        within a small token budget.

        NOTE(review): the nesting of the ``break`` after the header branch and
        of the final ``else: break`` was inferred from a flattened source —
        confirm that an over-budget text block is meant to stop the backtrack.

        Returns:
            ``(chunk_blocks, num_tokens, chunks)`` after the table was added.
        """
        budget = self._TABLE_CONTEXT_TOKENS
        context = []
        context_tokens = 0

        # Backtrack for header only if 1-2 short blocks precede
        text_seen = 0
        for j in range(block_idx - 1, -1, -1):
            prev = all_blocks[j]
            if prev.page != block.page:
                break
            if prev.block_type == 'Header':
                if context_tokens + prev.tokens <= budget:
                    context.insert(0, prev)
                    context_tokens += prev.tokens
                break
            elif prev.block_type == 'Text' and prev.content.strip():
                text_seen += 1
                if text_seen > 2:
                    break
                if context_tokens + prev.tokens <= budget:
                    context.insert(0, prev)
                    context_tokens += prev.tokens
                else:
                    break

        if num_tokens + context_tokens + block.tokens > self.chunk_size:
            if chunk_blocks:
                chunks, chunk_blocks, num_tokens = self._create_chunk(
                    chunks=chunks, blocks=chunk_blocks, header=header, page_elements=page_elements
                )

            # If we're backtracking context and the last chunk is ONLY that context, remove it
            if context and chunks and len(chunks[-1].blocks) == len(context):
                if all(kept == wanted for kept, wanted in zip(chunks[-1].blocks, context)):
                    chunks.pop()

            chunk_blocks = context + [block]
            num_tokens = context_tokens + block.tokens
        else:
            # The context blocks were processed just before this table, so
            # they are usually in the current chunk already. Add only those
            # that are missing, so headers/text are not duplicated and their
            # tokens are not counted twice.
            present = {self._block_key(existing) for existing in chunk_blocks}
            missing = [item for item in context if self._block_key(item) not in present]
            chunk_blocks.extend(missing + [block])
            num_tokens += sum(item.tokens for item in missing) + block.tokens

        return chunk_blocks, num_tokens, chunks

    def _process_header_table_block(self, block: "BaseBlock", chunk_blocks: List["BaseBlock"], num_tokens: int,
                                    chunks: List["MarkdownChunk"], next_block: "BaseBlock", header: str = None,
                                    page_elements: dict = None):
        """Process a header block (any block that is neither text nor table).

        The current chunk is closed first when the block would overflow it,
        unless the chunk is still small and a table follows directly.

        Returns:
            ``(chunk_blocks, num_tokens, chunks)`` after the block was added.
        """
        if chunk_blocks:
            # Don't split if current content is small and next is a table
            keep_with_table = (
                next_block is not None
                and next_block.block_type == 'Table'
                and num_tokens < self.chunk_overlap
            )
            if not keep_with_table and num_tokens + block.tokens > self.chunk_size:
                chunks, chunk_blocks, num_tokens = self._create_chunk(
                    chunks=chunks, blocks=chunk_blocks, header=header, page_elements=page_elements
                )

        chunk_blocks.append(block)
        num_tokens += block.tokens
        return chunk_blocks, num_tokens, chunks

    def _finalize_chunk(self, chunks: List["MarkdownChunk"], blocks: List["BaseBlock"], header: str,
                        page_elements: dict):
        """Append a chunk built from ``blocks``, with the elements of every page it spans."""
        elements = []
        for page_num in sorted({block.page for block in blocks}):
            if page_num in page_elements:
                elements.extend(page_elements[page_num])
        chunks.append(MarkdownChunk(blocks=blocks, header=header, elements=elements))

    def _create_chunk(self, chunks: List["MarkdownChunk"], blocks: List["BaseBlock"], header: str = None,
                      page_elements: dict = None) -> Tuple[List["MarkdownChunk"], List["BaseBlock"], int]:
        """Creates a chunk and returns overlap blocks.

        The overlap is collected by walking the chunk's blocks backwards
        (sentence by sentence for text blocks) until ``chunk_overlap`` tokens
        would be exceeded. When the whole chunk fits inside the overlap budget
        no overlap is returned.

        Returns:
            ``(chunks, overlap_blocks, overlap_tokens)``.
        """
        page_elements = page_elements or {}
        self._finalize_chunk(chunks, blocks, header, page_elements)

        if not self.chunk_overlap:
            return chunks, [], 0

        overlap_tokens = 0
        overlap_blocks = []

        for block in reversed(blocks):
            if block.block_type == "Text":
                kept = []
                for sentence in reversed(block.sentences):
                    if overlap_tokens + sentence.tokens > self.chunk_overlap:
                        # Only carry a partial block when it has content;
                        # an empty text block would pollute the next chunk.
                        if kept:
                            overlap_blocks.insert(0, TextBlock.from_sentences(sentences=kept, page=block.page))
                        return chunks, overlap_blocks, overlap_tokens
                    kept.insert(0, sentence)
                    overlap_tokens += sentence.tokens

                # The whole block fits: keep it, so the overlap stays
                # contiguous and matches the tokens already counted for it.
                if kept:
                    overlap_blocks.insert(0, TextBlock.from_sentences(sentences=kept, page=block.page))
            else:
                if overlap_tokens + block.tokens > self.chunk_overlap:
                    return chunks, overlap_blocks, overlap_tokens
                overlap_blocks.insert(0, block)
                overlap_tokens += block.tokens

        return chunks, [], 0