sec2md-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sec2md/__init__.py +24 -0
- sec2md/absolute_table_parser.py +622 -0
- sec2md/chunker/__init__.py +0 -0
- sec2md/chunker/markdown_blocks.py +116 -0
- sec2md/chunker/markdown_chunk.py +76 -0
- sec2md/chunker/markdown_chunker.py +234 -0
- sec2md/chunking.py +66 -0
- sec2md/core.py +93 -0
- sec2md/models.py +153 -0
- sec2md/parser.py +586 -0
- sec2md/section_extractor.py +316 -0
- sec2md/sections.py +104 -0
- sec2md/table_parser.py +386 -0
- sec2md/utils.py +109 -0
- sec2md-0.1.0.dist-info/METADATA +217 -0
- sec2md-0.1.0.dist-info/RECORD +19 -0
- sec2md-0.1.0.dist-info/WHEEL +5 -0
- sec2md-0.1.0.dist-info/licenses/LICENSE +21 -0
- sec2md-0.1.0.dist-info/top_level.txt +1 -0
sec2md/chunker/markdown_blocks.py
ADDED
@@ -0,0 +1,116 @@
```python
import re
from abc import ABC
from typing import List


def estimate_tokens(text: str) -> int:
    """
    Estimate token count using character/4 heuristic.

    This is a simple approximation. For exact token counting,
    use your embedding provider's tokenizer.
    """
    return max(1, len(text) // 4)


def split_sentences(text: str) -> List[str]:
    """Simple regex-based sentence splitter"""
    # Split on .!? followed by whitespace and capital letter or end of string
    # Handles common abbreviations like Mr., Dr., Inc., etc.
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
    return [s.strip() for s in sentences if s.strip()]


class BaseBlock(ABC):
    block_type: str

    def __init__(self, content: str, page: int):
        self.content = content
        self.page = page

    @property
    def tokens(self) -> int:
        return estimate_tokens(self.content)


class Sentence:

    def __init__(self, content: str):
        self.content = content

    @property
    def tokens(self) -> int:
        return estimate_tokens(self.content)


class TextBlock(BaseBlock):
    block_type: str = 'Text'

    def __init__(self, content: str, page: int):
        super().__init__(content=content, page=page)

    @property
    def sentences(self) -> List[Sentence]:
        """Returns the text block sentences"""
        return [Sentence(content=content) for content in split_sentences(self.content)]

    @classmethod
    def from_sentences(cls, sentences: List[Sentence], page: int):
        content = " ".join([sentence.content for sentence in sentences])
        return cls(content=content, page=page)


class AudioParagraphBlock(BaseBlock):
    block_type: str = "Text"

    def __init__(self, content: str, page: int, paragraph_id: int, audio_start: float, audio_end: float):
        super().__init__(content=content, page=page)
        self.paragraph_id = paragraph_id
        self.audio_start = audio_start
        self.audio_end = audio_end

    @property
    def sentences(self) -> List[Sentence]:
        """Returns the text block sentences"""
        return [Sentence(content=content) for content in split_sentences(self.content)]

    def format(self) -> dict:
        """Formats the audio paragraphs"""
        return {"id": self.paragraph_id, "content": self.content, "start": self.audio_start, "end": self.audio_end}


class TableBlock(BaseBlock):
    block_type: str = 'Table'

    def __init__(self, content: str, page: int):
        super().__init__(content=content, page=page)
        self.content = self._to_minified_markdown()

    def _to_minified_markdown(self) -> str:
        """Returns the table in a Minified Markdown format"""
        lines = self.content.split('\n')
        cleaned_lines = []

        for i, line in enumerate(lines):
            if not line.strip():
                continue

            parts = line.split('|')
            cleaned_parts = [re.sub(r'\s+', ' ', part.strip()) for part in parts]
            cleaned_line = '|'.join(cleaned_parts)

            if i == 1:
                num_cols = len(cleaned_parts) - 1
                separator = '|' + '|'.join(['---'] * num_cols) + '|'
                cleaned_lines.append(separator)
            else:
                cleaned_lines.append(cleaned_line)

        return '\n'.join(cleaned_lines)


class HeaderBlock(BaseBlock):
    block_type = 'Header'

    def __init__(self, content: str, page: int):
        super().__init__(content=content, page=page)
```
sec2md/chunker/markdown_chunk.py
ADDED
@@ -0,0 +1,76 @@
```python
from typing import List, Optional

from sec2md.chunker.markdown_blocks import BaseBlock


class MarkdownChunk:
    """Represents a chunk of markdown content that can be embedded"""

    def __init__(self, blocks: List[BaseBlock], header: Optional[str] = None):
        """Initialize a markdown chunk with blocks and optional header for embedding"""
        self.vector: Optional[List[float]] = None
        self.blocks = blocks
        self.page = blocks[0].page
        self.header = header

    def set_vector(self, vector: List[float]):
        """Set the vector embedding for this chunk"""
        self.vector = vector

    @property
    def content(self) -> str:
        """Get the text content of this chunk"""
        return "\n".join([block.content for block in self.blocks])

    @property
    def data(self) -> List[dict]:
        """Returns a list of block data grouped by page with ONLY the chunk's content"""
        page_blocks = {}

        for block in self.blocks:
            if block.page not in page_blocks:
                page_blocks[block.page] = []
            page_blocks[block.page].append(block)

        page_content_data = []
        for page, blocks in page_blocks.items():
            # Only include the content from blocks in THIS chunk, not full page content
            page_content = "\n".join(block.content for block in blocks)
            if not page_content.strip():
                continue

            page_content_data.append({
                "page": page,
                "content": page_content
            })

        return sorted(page_content_data, key=lambda x: x["page"])

    @property
    def pages(self) -> List[dict]:
        """Returns a list of pages with ONLY this chunk's content (not full page content)"""
        return self.data

    @property
    def embedding_text(self) -> str:
        """Get the text to use for embedding, with optional header prepended"""
        if self.header:
            return f"{self.header}\n\n...\n\n{self.content}"
        return self.content

    @property
    def has_table(self) -> bool:
        """Returns True if this chunk contains one or more table blocks"""
        return any(block.block_type == 'Table' for block in self.blocks)

    @property
    def num_tokens(self) -> int:
        """Returns the total number of tokens in this chunk"""
        return sum(block.tokens for block in self.blocks)

    def __repr__(self):
        return f"MarkdownChunk(page={self.page}, blocks={len(self.blocks)})"

    def _repr_markdown_(self):
        """This method is called by IPython to display as Markdown"""
        return self.content
```
sec2md/chunker/markdown_chunker.py
ADDED
@@ -0,0 +1,234 @@
```python
import logging
from typing import Union, Tuple, List, Dict

from sec2md.chunker.markdown_chunk import MarkdownChunk
from sec2md.chunker.markdown_blocks import BaseBlock, TextBlock, TableBlock, HeaderBlock

logger = logging.getLogger(__name__)


class MarkdownChunker:
    """Splits markdown content into chunks"""

    def __init__(self, chunk_size: int = 512, chunk_overlap: int = 128):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split(self, pages: List[Dict[str, Union[int, str]]], header: str = None) -> List[MarkdownChunk]:
        """Split the pages into chunks with optional header for embedding context"""
        blocks = self._split_into_blocks(pages=pages)
        return self._chunk_blocks(blocks=blocks, header=header)

    def chunk_text(self, text: str) -> List[str]:
        """Chunk a single text string into multiple chunks"""
        pages = [{"page": 0, "content": text}]
        chunks = self.split(pages=pages)
        return [chunk.content for chunk in chunks]

    @staticmethod
    def _split_into_blocks(pages: List[Dict[str, Union[int, str]]]):
        """Splits the page into blocks"""
        blocks = []
        table_content = ""
        last_page = None

        for page in pages:
            last_page = page['page']
            for line in page['content'].split('\n'):
                if table_content and not MarkdownChunker._is_table_line(line):
                    block = TableBlock(content=table_content, page=page['page'])
                    blocks.append(block)
                    table_content = ""

                if line.startswith("#"):
                    block = HeaderBlock(content=line, page=page['page'])
                    blocks.append(block)

                elif MarkdownChunker._is_table_line(line):
                    table_content += f"{line}\n"

                else:
                    block = TextBlock(content=line, page=page['page'])
                    blocks.append(block)

        if table_content and last_page is not None:
            block = TableBlock(content=table_content, page=last_page)
            blocks.append(block)

        return blocks

    @staticmethod
    def _is_table_line(line: str) -> bool:
        import re
        if '|' not in line:
            return False
        stripped = line.strip()
        if not stripped:
            return False
        # Separator rows (e.g. |---|:--:|) and ordinary cell rows both count:
        # any non-empty line containing '|' is treated as part of a table.
        align_pattern = re.compile(r'^\s*:?-+:?\s*$')
        cells = [c.strip() for c in stripped.strip('|').split('|')]
        if all(align_pattern.match(c) for c in cells):
            return True
        return True

    def _chunk_blocks(self, blocks: List[BaseBlock], header: str = None) -> List[MarkdownChunk]:
        """Converts the blocks to chunks"""
        chunks = []
        chunk_blocks = []
        num_tokens = 0

        for i, block in enumerate(blocks):
            next_block = blocks[i + 1] if i + 1 < len(blocks) else None

            if block.block_type == 'Text':
                chunk_blocks, num_tokens, chunks = self._process_text_block(
                    block, chunk_blocks, num_tokens, chunks, header
                )

            elif block.block_type == 'Table':
                chunk_blocks, num_tokens, chunks = self._process_table_block(
                    block, chunk_blocks, num_tokens, chunks, blocks, i, header
                )

            else:
                chunk_blocks, num_tokens, chunks = self._process_header_table_block(
                    block, chunk_blocks, num_tokens, chunks, next_block, header
                )

        if chunk_blocks:
            chunks.append(MarkdownChunk(blocks=chunk_blocks, header=header))

        return chunks

    def _process_text_block(self, block: TextBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
                            chunks: List[MarkdownChunk], header: str = None):
        """Process a text block by breaking it into sentences if needed"""
        sentences = []
        sentences_tokens = 0

        for sentence in block.sentences:
            if num_tokens + sentences_tokens + sentence.tokens > self.chunk_size:
                if sentences:
                    new_block = TextBlock.from_sentences(sentences=sentences, page=block.page)
                    chunk_blocks.append(new_block)
                    num_tokens += sentences_tokens

                chunks, chunk_blocks, num_tokens = self._create_chunk(chunks=chunks, blocks=chunk_blocks, header=header)

                sentences = [sentence]
                sentences_tokens = sentence.tokens

            else:
                sentences.append(sentence)
                sentences_tokens += sentence.tokens

        if sentences:
            new_block = TextBlock.from_sentences(sentences=sentences, page=block.page)
            chunk_blocks.append(new_block)
            num_tokens += sentences_tokens

        return chunk_blocks, num_tokens, chunks

    def _process_table_block(self, block: BaseBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
                             chunks: List[MarkdownChunk], all_blocks: List[BaseBlock], block_idx: int, header: str = None):
        """Process a table block with optional header backtrack"""
        context = []
        context_tokens = 0

        # Backtrack for header only if 1-2 short blocks precede
        count = 0
        for j in range(block_idx - 1, -1, -1):
            prev = all_blocks[j]
            if prev.page != block.page:
                break
            if prev.block_type == 'Header':
                if context_tokens + prev.tokens <= 128:
                    context.insert(0, prev)
                    context_tokens += prev.tokens
                break
            elif prev.block_type == 'Text' and prev.content.strip():
                count += 1
                if count > 2:
                    break
                if context_tokens + prev.tokens <= 128:
                    context.insert(0, prev)
                    context_tokens += prev.tokens
                else:
                    break

        if num_tokens + context_tokens + block.tokens > self.chunk_size:
            if chunk_blocks:
                chunks, chunk_blocks, num_tokens = self._create_chunk(chunks=chunks, blocks=chunk_blocks, header=header)

            # If we're backtracking context and the last chunk is ONLY that context, remove it
            if context and chunks and len(chunks[-1].blocks) == len(context):
                if all(chunks[-1].blocks[i] == context[i] for i in range(len(context))):
                    chunks.pop()

            chunk_blocks = context + [block]
            num_tokens = context_tokens + block.tokens
        else:
            chunk_blocks.extend(context + [block])
            num_tokens += context_tokens + block.tokens

        return chunk_blocks, num_tokens, chunks

    def _process_header_table_block(self, block: BaseBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
                                    chunks: List[MarkdownChunk], next_block: BaseBlock, header: str = None):
        """Process a header block"""
        if not chunk_blocks:
            chunk_blocks.append(block)
            num_tokens += block.tokens
            return chunk_blocks, num_tokens, chunks

        # Don't split if current content is small and next is a table
        if next_block and next_block.block_type == 'Table' and num_tokens < self.chunk_overlap:
            chunk_blocks.append(block)
            num_tokens += block.tokens
            return chunk_blocks, num_tokens, chunks

        if num_tokens + block.tokens > self.chunk_size:
            chunks, chunk_blocks, num_tokens = self._create_chunk(chunks=chunks, blocks=chunk_blocks, header=header)
            chunk_blocks.append(block)
            num_tokens += block.tokens
        else:
            chunk_blocks.append(block)
            num_tokens += block.tokens

        return chunk_blocks, num_tokens, chunks

    def _create_chunk(self, chunks: List[MarkdownChunk], blocks: List[BaseBlock], header: str = None) -> Tuple[
        List[MarkdownChunk], List[BaseBlock], int]:
        """Creates a chunk and returns the overlap blocks (and their token count) that seed the next chunk"""
        chunks.append(MarkdownChunk(blocks=blocks, header=header))

        if not self.chunk_overlap:
            return chunks, [], 0

        overlap_tokens = 0
        overlap_blocks = []

        for block in reversed(blocks):
            if block.block_type == "Text":
                sentences = []

                for sentence in reversed(block.sentences):

                    if overlap_tokens + sentence.tokens > self.chunk_overlap:
                        text_block = TextBlock.from_sentences(sentences=sentences, page=block.page)
                        overlap_blocks.insert(0, text_block)
                        return chunks, overlap_blocks, overlap_tokens

                    else:
                        sentences.insert(0, sentence)
                        overlap_tokens += sentence.tokens

            else:
                if overlap_tokens + block.tokens > self.chunk_overlap:
                    return chunks, overlap_blocks, overlap_tokens

                else:
                    overlap_blocks.insert(0, block)
                    overlap_tokens += block.tokens

        return chunks, [], 0
```
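Taken together, `_split_into_blocks` turns page text into header/table/text blocks line by line, and `_chunk_blocks` packs them into roughly `chunk_size`-token chunks, carrying up to `chunk_overlap` tokens of trailing context into each new chunk. A minimal sketch with invented page content:

```python
from sec2md.chunker.markdown_chunker import MarkdownChunker

pages = [
    {"page": 1, "content": "# Overview\nRevenue grew. Margins held steady.\n|Metric|Value|\n|---|---|\n|Cash|100|"},
    {"page": 2, "content": "Liquidity remains strong. No material changes."},
]

chunker = MarkdownChunker(chunk_size=512, chunk_overlap=128)
chunks = chunker.split(pages=pages, header="Example Co. 10-Q")

for chunk in chunks:
    print(chunk.page, chunk.has_table, chunk.num_tokens)
```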
sec2md/chunking.py
ADDED
@@ -0,0 +1,66 @@
```python
"""Chunking utilities for page-aware markdown splitting."""

from typing import List, Optional
from sec2md.models import Page, Section
from sec2md.chunker.markdown_chunker import MarkdownChunker
from sec2md.chunker.markdown_chunk import MarkdownChunk


def chunk_pages(
    pages: List[Page],
    chunk_size: int = 512,
    chunk_overlap: int = 128,
    header: Optional[str] = None
) -> List[MarkdownChunk]:
    """
    Chunk pages into overlapping markdown chunks.

    Args:
        pages: List of Page objects
        chunk_size: Target chunk size in tokens (estimated as chars/4)
        chunk_overlap: Overlap between chunks in tokens
        header: Optional header to prepend to each chunk's embedding_text

    Returns:
        List of MarkdownChunk objects with page tracking

    Example:
        >>> pages = sec2md.convert_to_markdown(html, return_pages=True)
        >>> chunks = sec2md.chunk_pages(pages, chunk_size=512)
        >>> for chunk in chunks:
        ...     print(f"Page {chunk.page}: {chunk.content[:100]}...")
    """
    chunker = MarkdownChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    pages_data = [{"page": p.number, "content": p.content} for p in pages]
    return chunker.split(pages=pages_data, header=header)


def chunk_section(
    section: Section,
    chunk_size: int = 512,
    chunk_overlap: int = 128,
    header: Optional[str] = None
) -> List[MarkdownChunk]:
    """
    Chunk a filing section into overlapping markdown chunks.

    Args:
        section: Section object from extract_sections()
        chunk_size: Target chunk size in tokens (estimated as chars/4)
        chunk_overlap: Overlap between chunks in tokens
        header: Optional header to prepend to each chunk's embedding_text

    Returns:
        List of MarkdownChunk objects

    Example:
        >>> sections = sec2md.extract_sections(pages, filing_type="10-K")
        >>> risk = sec2md.get_section(sections, Item10K.RISK_FACTORS)
        >>> chunks = sec2md.chunk_section(risk, chunk_size=512)
    """
    return chunk_pages(
        pages=section.pages,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        header=header
    )
```
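Note that `chunk_section` is deliberately a thin wrapper: a `Section` already carries its own `pages`, so section-scoped chunking reduces to `chunk_pages` over that subset, and both paths return the same `MarkdownChunk` objects.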
sec2md/core.py
ADDED
@@ -0,0 +1,93 @@
```python
"""Core conversion functionality."""

from typing import overload, List
from sec2md.utils import is_url, fetch
from sec2md.parser import Parser
from sec2md.models import Page


@overload
def convert_to_markdown(
    source: str | bytes,
    *,
    user_agent: str | None = None,
    return_pages: bool = False,
) -> str: ...


@overload
def convert_to_markdown(
    source: str | bytes,
    *,
    user_agent: str | None = None,
    return_pages: bool = True,
) -> List[Page]: ...


def convert_to_markdown(
    source: str | bytes,
    *,
    user_agent: str | None = None,
    return_pages: bool = False,
) -> str | List[Page]:
    """
    Convert SEC filing HTML to Markdown.

    Args:
        source: URL or HTML string/bytes
        user_agent: User agent for EDGAR requests (required for sec.gov URLs)
        return_pages: If True, returns List[Page] instead of markdown string

    Returns:
        Markdown string (default) or List[Page] if return_pages=True

    Raises:
        ValueError: If source appears to be PDF content or other non-HTML format

    Examples:
        >>> # From URL - get markdown
        >>> md = convert_to_markdown(
        ...     "https://www.sec.gov/Archives/edgar/data/.../10k.htm",
        ...     user_agent="Lucas Astorian <lucas@intellifin.ai>"
        ... )

        >>> # Get pages for section extraction
        >>> pages = convert_to_markdown(filing.html(), return_pages=True)

        >>> # With edgartools
        >>> from edgar import Company, set_identity
        >>> set_identity("Lucas Astorian <lucas@intellifin.ai>")
        >>> company = Company('AAPL')
        >>> filing = company.get_filings(form="10-K").latest()
        >>> md = convert_to_markdown(filing.html())
    """
    # Handle bytes input
    if isinstance(source, bytes):
        # Check if it's PDF
        if source.startswith(b'%PDF'):
            raise ValueError(
                "PDF content detected. This library only supports HTML input. "
                "Please extract HTML from the filing first."
            )
        source = source.decode('utf-8', errors='ignore')

    # Check for PDF in string
    if isinstance(source, str) and source.strip().startswith('%PDF'):
        raise ValueError(
            "PDF content detected. This library only supports HTML input. "
            "Please extract HTML from the filing first."
        )

    # Fetch from URL if needed
    if is_url(source):
        html = fetch(source, user_agent=user_agent)
    else:
        html = source

    # Parse and convert
    parser = Parser(html)

    if return_pages:
        return parser.get_pages()
    else:
        return parser.markdown()
```