sec2md 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sec2md might be problematic. Click here for more details.

@@ -0,0 +1,116 @@
1
+ import re
2
+ from abc import ABC
3
+ from typing import List
4
+
5
+
6
def estimate_tokens(text: str) -> int:
    """
    Approximate how many tokens ``text`` occupies.

    Applies the rough "four characters per token" rule and never reports
    fewer than one token, so even an empty string counts as 1.  When an
    exact count matters, use the tokenizer of your embedding provider.
    """
    approx = len(text) // 4
    if approx < 1:
        return 1
    return approx
14
+
15
+
16
# Abbreviations whose trailing period must not be read as the end of a sentence.
_SENTENCE_ABBREVIATIONS = frozenset({
    "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Jr.", "Sr.", "St.",
    "Inc.", "Corp.", "Co.", "Ltd.", "vs.",
})


def split_sentences(text: str) -> List[str]:
    """
    Split ``text`` into sentences with a simple regex heuristic.

    A boundary is any run of whitespace that follows ``.``, ``!`` or ``?``
    and precedes an uppercase ASCII letter.  A piece that ends in a common
    abbreviation (``Mr.``, ``Dr.``, ``Inc.`` ...) is not treated as a
    complete sentence; it is joined to the piece that follows with a single
    space.  The trade-off is that a sentence genuinely ending in such an
    abbreviation is merged with the next one.

    Args:
        text: The text to split.

    Returns:
        The stripped, non-empty sentences in their original order.  Empty or
        whitespace-only input yields an empty list.
    """
    pieces = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)

    sentences: List[str] = []
    merge_with_previous = False
    for piece in pieces:
        piece = piece.strip()
        if not piece:
            continue

        if merge_with_previous:
            # The previous piece stopped at an abbreviation, so this piece
            # continues the same sentence.
            sentences[-1] = f"{sentences[-1]} {piece}"
        else:
            sentences.append(piece)

        # Ignore opening punctuation so that e.g. '(Dr.' is still recognised.
        last_word = piece.split()[-1].lstrip("([\"'")
        merge_with_previous = last_word in _SENTENCE_ABBREVIATIONS

    return sentences
22
+
23
+
24
class BaseBlock(ABC):
    """
    Common base for the blocks a markdown page is split into.

    Every block stores its text ``content`` and the ``page`` value it was
    created with; concrete subclasses supply ``block_type``.
    """
    block_type: str  # label set by each concrete subclass

    def __init__(self, content: str, page: int):
        self.page = page
        self.content = content

    @property
    def tokens(self) -> int:
        """Estimated token count of the current ``content``."""
        token_estimate = estimate_tokens(self.content)
        return token_estimate
34
+
35
+
36
class Sentence:
    """A single sentence of text together with its estimated token count."""

    def __init__(self, content: str):
        self.content = content

    @property
    def tokens(self) -> int:
        """Estimated token count of the sentence text."""
        token_estimate = estimate_tokens(self.content)
        return token_estimate
44
+
45
+
46
class TextBlock(BaseBlock):
    """A block of plain text that can be broken down into sentences."""
    block_type: str = "Text"

    def __init__(self, content: str, page: int):
        super().__init__(content, page)

    @property
    def sentences(self) -> List[Sentence]:
        """Split the block's content and wrap each piece in a ``Sentence``."""
        wrapped = []
        for piece in split_sentences(self.content):
            wrapped.append(Sentence(content=piece))
        return wrapped

    @classmethod
    def from_sentences(cls, sentences: List[Sentence], page: int):
        """Build a block whose content is the sentences joined by single spaces."""
        joined = " ".join(item.content for item in sentences)
        return cls(content=joined, page=page)
61
+
62
+
63
class AudioParagraphBlock(BaseBlock):
    """
    A paragraph of text tied to an audio time range.

    Reports the same ``block_type`` ("Text") as ``TextBlock`` and additionally
    stores a paragraph id plus the start/end values of the audio span.
    """
    block_type: str = "Text"

    def __init__(self, content: str, page: int, paragraph_id: int, audio_start: float, audio_end: float):
        super().__init__(content, page)
        self.audio_end = audio_end
        self.audio_start = audio_start
        self.paragraph_id = paragraph_id

    @property
    def sentences(self) -> List[Sentence]:
        """Split the paragraph's content and wrap each piece in a ``Sentence``."""
        wrapped = []
        for piece in split_sentences(self.content):
            wrapped.append(Sentence(content=piece))
        return wrapped

    def format(self) -> dict:
        """Return the paragraph as a dict with ``id``, ``content``, ``start`` and ``end`` keys."""
        formatted = {
            "id": self.paragraph_id,
            "content": self.content,
            "start": self.audio_start,
            "end": self.audio_end,
        }
        return formatted
80
+
81
+
82
class TableBlock(BaseBlock):
    """
    A markdown table whose content is stored in a whitespace-minified form.

    On construction the raw table text is rewritten by
    ``_to_minified_markdown`` and the result replaces ``content``, so
    ``tokens`` reflects the minified text.
    """
    block_type: str = 'Table'

    # One cell of a markdown delimiter row, e.g. ``---``, ``:--`` or ``:-:``.
    _DELIMITER_CELL = re.compile(r':?-+:?')

    def __init__(self, content: str, page: int):
        super().__init__(content=content, page=page)
        self.content = self._to_minified_markdown()

    def _to_minified_markdown(self) -> str:
        """
        Return the table in a minified markdown format.

        Blank lines are dropped, every cell is stripped and its internal
        whitespace collapsed to single spaces.  If the second non-blank row
        is a delimiter row it is replaced by a minimal ``|---|...|`` row with
        one ``---`` per table column (alignment colons are discarded).  A
        second row that is not a delimiter row is kept as data.
        """
        minified_rows = []

        for line in self.content.split('\n'):
            if not line.strip():
                continue

            cells = [re.sub(r'\s+', ' ', part.strip()) for part in line.split('|')]

            # Position is counted over non-blank rows only, so leading blank
            # lines cannot shift the delimiter row onto the header row.
            if len(minified_rows) == 1:
                num_cols = self._delimiter_width(cells)
                if num_cols:
                    minified_rows.append('|' + '|'.join(['---'] * num_cols) + '|')
                    continue

            minified_rows.append('|'.join(cells))

        return '\n'.join(minified_rows)

    @classmethod
    def _delimiter_width(cls, cells: List[str]) -> int:
        """
        Return the column count of a delimiter row, or 0 if it is not one.

        ``cells`` is a row already split on ``|``.  The empty strings produced
        by a leading and/or trailing pipe are not columns and are ignored, so
        the rebuilt delimiter row has as many cells as the header row (which
        GitHub-flavoured markdown requires for the table to be recognised).
        """
        inner = list(cells)
        if inner and not inner[0]:
            inner = inner[1:]
        if inner and not inner[-1]:
            inner = inner[:-1]

        if inner and all(cls._DELIMITER_CELL.fullmatch(cell) for cell in inner):
            return len(inner)
        return 0
110
+
111
+
112
class HeaderBlock(BaseBlock):
    """A block holding a single markdown header line."""
    block_type = "Header"

    def __init__(self, content: str, page: int):
        super().__init__(content, page)
@@ -0,0 +1,76 @@
1
+ from typing import List, Optional
2
+
3
+ from sec2md.chunker.markdown_blocks import BaseBlock
4
+
5
+
6
class MarkdownChunk:
    """A group of blocks that is embedded as one unit of markdown."""

    def __init__(self, blocks: List[BaseBlock], header: Optional[str] = None):
        """
        Store the blocks and the optional embedding header.

        ``page`` is taken from the first block, so ``blocks`` must not be
        empty (an empty list raises ``IndexError``).
        """
        self.blocks = blocks
        self.header = header
        self.page = blocks[0].page
        self.vector: Optional[List[float]] = None

    def set_vector(self, vector: List[float]):
        """Attach the embedding vector computed for this chunk."""
        self.vector = vector

    @property
    def content(self) -> str:
        """The contents of all blocks, one block per line."""
        parts = [block.content for block in self.blocks]
        return "\n".join(parts)

    @property
    def data(self) -> List[dict]:
        """
        Per-page view of this chunk's own content.

        Returns one ``{"page": ..., "content": ...}`` dict per page that
        contributes non-whitespace text, sorted by page.  Only the blocks of
        this chunk are included, never the rest of the page.
        """
        grouped = {}
        for block in self.blocks:
            grouped.setdefault(block.page, []).append(block)

        entries = []
        for page_number, members in grouped.items():
            text = "\n".join(member.content for member in members)
            if text.strip():
                entries.append({"page": page_number, "content": text})

        entries.sort(key=lambda entry: entry["page"])
        return entries

    @property
    def pages(self) -> List[dict]:
        """Alias of ``data``: the chunk's content grouped by page."""
        return self.data

    @property
    def embedding_text(self) -> str:
        """The text to embed: the content, preceded by the header when one is set."""
        if not self.header:
            return self.content
        return f"{self.header}\n\n...\n\n{self.content}"

    @property
    def has_table(self) -> bool:
        """True when at least one block reports ``block_type == 'Table'``."""
        for block in self.blocks:
            if block.block_type == 'Table':
                return True
        return False

    @property
    def num_tokens(self) -> int:
        """Sum of the estimated token counts of all blocks."""
        total = 0
        for block in self.blocks:
            total += block.tokens
        return total

    def __repr__(self):
        return f"MarkdownChunk(page={self.page}, blocks={len(self.blocks)})"

    def _repr_markdown_(self):
        """Rich-display hook (IPython): render the chunk as its markdown content."""
        return self.content
@@ -0,0 +1,234 @@
1
+ import logging
2
+ from typing import Union, Tuple, List, Dict
3
+
4
+ from sec2md.chunker.markdown_chunk import MarkdownChunk
5
+ from sec2md.chunker.markdown_blocks import BaseBlock, TextBlock, TableBlock, HeaderBlock
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
class MarkdownChunker:
    """
    Splits page-wise markdown into ``MarkdownChunk`` objects.

    Pages are first broken into header, table and text blocks (one block per
    line, with consecutive table lines merged into one table block).  The
    blocks are then packed into chunks of roughly ``chunk_size`` estimated
    tokens; a new chunk may start with up to ``chunk_overlap`` tokens repeated
    from the end of the previous one.
    """

    # Token budget for the header/text context that is kept with a table.
    _TABLE_CONTEXT_TOKENS = 128

    def __init__(self, chunk_size: int = 512, chunk_overlap: int = 128):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split(self, pages: List[Dict[str, Union[int, str]]], header: Union[str, None] = None) -> List[MarkdownChunk]:
        """
        Split the pages into chunks.

        Args:
            pages: Dicts with a ``"page"`` value and a ``"content"`` string.
            header: Optional header stored on every chunk for embedding context.

        Returns:
            The chunks in document order.
        """
        blocks = self._split_into_blocks(pages=pages)
        return self._chunk_blocks(blocks=blocks, header=header)

    def chunk_text(self, text: str) -> List[str]:
        """Chunk a single text string (treated as page 0) and return the chunk contents."""
        pages = [{"page": 0, "content": text}]
        chunks = self.split(pages=pages)
        return [chunk.content for chunk in chunks]

    @staticmethod
    def _split_into_blocks(pages: List[Dict[str, Union[int, str]]]):
        """
        Split the pages into header, table and text blocks.

        Lines starting with ``#`` become header blocks, runs of consecutive
        table lines become one table block, and every other line becomes a
        text block.  A table is attributed to the page on which it starts,
        even when the run of table lines continues onto the next page.
        """
        blocks = []
        table_lines = []
        table_page = None

        def flush_table():
            # Emit the pending table (if any) with the page it started on.
            if table_lines:
                blocks.append(TableBlock(content="".join(table_lines), page=table_page))
                table_lines.clear()

        for page in pages:
            page_number = page['page']
            for line in page['content'].split('\n'):
                is_header = line.startswith("#")
                # A header line never extends a table, even if it contains '|'.
                continues_table = not is_header and MarkdownChunker._is_table_line(line)

                if not continues_table:
                    # Flush before emitting the current line so that blocks
                    # stay in document order.
                    flush_table()

                if is_header:
                    blocks.append(HeaderBlock(content=line, page=page_number))
                elif continues_table:
                    if not table_lines:
                        table_page = page_number
                    table_lines.append(f"{line}\n")
                else:
                    blocks.append(TextBlock(content=line, page=page_number))

        flush_table()
        return blocks

    @staticmethod
    def _is_table_line(line: str) -> bool:
        """Return True when the line contains a pipe and is therefore treated as a table row."""
        return '|' in line

    def _chunk_blocks(self, blocks: List[BaseBlock], header: Union[str, None] = None) -> List[MarkdownChunk]:
        """Pack the blocks into chunks, dispatching on each block's ``block_type``."""
        chunks = []
        chunk_blocks = []
        num_tokens = 0

        for i, block in enumerate(blocks):
            next_block = blocks[i + 1] if i + 1 < len(blocks) else None

            if block.block_type == 'Text':
                chunk_blocks, num_tokens, chunks = self._process_text_block(
                    block, chunk_blocks, num_tokens, chunks, header
                )
            elif block.block_type == 'Table':
                chunk_blocks, num_tokens, chunks = self._process_table_block(
                    block, chunk_blocks, num_tokens, chunks, blocks, i, header
                )
            else:
                chunk_blocks, num_tokens, chunks = self._process_header_table_block(
                    block, chunk_blocks, num_tokens, chunks, next_block, header
                )

        # Whatever is still pending forms the final chunk.
        if chunk_blocks:
            chunks.append(MarkdownChunk(blocks=chunk_blocks, header=header))

        return chunks

    def _process_text_block(self, block: TextBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
                            chunks: List[MarkdownChunk], header: Union[str, None] = None):
        """
        Add a text block to the current chunk sentence by sentence.

        When the next sentence would push the chunk past ``chunk_size`` the
        sentences gathered so far are appended, the chunk is closed and the
        sentence starts the next chunk.  A single sentence longer than
        ``chunk_size`` is kept whole and yields an oversized chunk.

        Returns:
            ``(chunk_blocks, num_tokens, chunks)`` after processing the block.
        """
        sentences = []
        sentences_tokens = 0

        for sentence in block.sentences:
            if num_tokens + sentences_tokens + sentence.tokens > self.chunk_size:
                if sentences:
                    chunk_blocks.append(TextBlock.from_sentences(sentences=sentences, page=block.page))
                    num_tokens += sentences_tokens

                # With nothing pending there is no chunk to close; a chunk
                # cannot be built from an empty block list.
                if chunk_blocks:
                    chunks, chunk_blocks, num_tokens = self._create_chunk(
                        chunks=chunks, blocks=chunk_blocks, header=header
                    )

                sentences = [sentence]
                sentences_tokens = sentence.tokens
            else:
                sentences.append(sentence)
                sentences_tokens += sentence.tokens

        if sentences:
            chunk_blocks.append(TextBlock.from_sentences(sentences=sentences, page=block.page))
            num_tokens += sentences_tokens

        return chunk_blocks, num_tokens, chunks

    def _process_table_block(self, block: BaseBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
                             chunks: List[MarkdownChunk], all_blocks: List[BaseBlock], block_idx: int,
                             header: Union[str, None] = None):
        """
        Add a table block, keeping a little preceding context with it.

        The context is found by walking backwards on the same page: up to two
        non-blank text blocks and the nearest header, limited to
        ``_TABLE_CONTEXT_TOKENS`` tokens.  Blank text blocks and other tables
        are skipped.  If the table does not fit in the current chunk, that
        chunk is closed and a new one starts with the context followed by the
        table (no sentence overlap is carried over).  If it fits, only the
        context blocks not already present in the chunk are added.

        Returns:
            ``(chunk_blocks, num_tokens, chunks)`` after processing the block.
        """
        budget = self._TABLE_CONTEXT_TOKENS
        context = []
        context_tokens = 0
        text_count = 0

        for j in range(block_idx - 1, -1, -1):
            prev = all_blocks[j]
            if prev.page != block.page:
                break
            if prev.block_type == 'Header':
                if context_tokens + prev.tokens <= budget:
                    context.insert(0, prev)
                    context_tokens += prev.tokens
                # The nearest header always ends the search.
                break
            elif prev.block_type == 'Text' and prev.content.strip():
                text_count += 1
                if text_count > 2:
                    break
                if context_tokens + prev.tokens <= budget:
                    context.insert(0, prev)
                    context_tokens += prev.tokens
                else:
                    break

        # Context that was just processed is usually still in the current
        # chunk; adding it again would duplicate it inside the same chunk.
        missing = [
            candidate for candidate in context
            if not any(self._same_block(candidate, existing) for existing in chunk_blocks)
        ]
        missing_tokens = sum(candidate.tokens for candidate in missing)

        if num_tokens + missing_tokens + block.tokens > self.chunk_size:
            if chunk_blocks:
                chunks, chunk_blocks, num_tokens = self._create_chunk(
                    chunks=chunks, blocks=chunk_blocks, header=header
                )

                # If the chunk just closed holds nothing but the context, drop
                # it: the same blocks open the new chunk together with the table.
                closed = chunks[-1].blocks
                if context and len(closed) == len(context) and all(
                    self._same_block(kept, wanted) for kept, wanted in zip(closed, context)
                ):
                    chunks.pop()

            chunk_blocks = context + [block]
            num_tokens = context_tokens + block.tokens
        else:
            chunk_blocks.extend(missing + [block])
            num_tokens += missing_tokens + block.tokens

        return chunk_blocks, num_tokens, chunks

    @staticmethod
    def _same_block(first: BaseBlock, second: BaseBlock) -> bool:
        """
        Return True when both blocks stand for the same piece of the document.

        Text blocks are rebuilt from their sentences while chunking, so object
        identity alone is not enough; blocks also match when type, page and
        whitespace-normalised content agree.
        """
        if first is second:
            return True
        return (
            first.block_type == second.block_type
            and first.page == second.page
            and first.content.split() == second.content.split()
        )

    def _process_header_table_block(self, block: BaseBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
                                    chunks: List[MarkdownChunk], next_block: BaseBlock,
                                    header: Union[str, None] = None):
        """
        Add a header block (any block that is neither text nor table).

        The current chunk is closed first only if it is non-empty, the header
        would exceed ``chunk_size``, and the header is not being kept with a
        directly following table (which happens while the chunk holds fewer
        than ``chunk_overlap`` tokens).

        Returns:
            ``(chunk_blocks, num_tokens, chunks)`` after processing the block.
        """
        keep_with_table = (
            next_block is not None
            and next_block.block_type == 'Table'
            and num_tokens < self.chunk_overlap
        )

        if chunk_blocks and not keep_with_table and num_tokens + block.tokens > self.chunk_size:
            chunks, chunk_blocks, num_tokens = self._create_chunk(
                chunks=chunks, blocks=chunk_blocks, header=header
            )

        chunk_blocks.append(block)
        num_tokens += block.tokens
        return chunk_blocks, num_tokens, chunks

    def _create_chunk(self, chunks: List[MarkdownChunk], blocks: List[BaseBlock],
                      header: Union[str, None] = None) -> Tuple[List[MarkdownChunk], List[BaseBlock], int]:
        """
        Close the current chunk and compute the overlap for the next one.

        ``blocks`` becomes a new ``MarkdownChunk`` appended to ``chunks``.  The
        overlap is then collected from the end of ``blocks`` backwards: whole
        blocks while they fit into ``chunk_overlap`` tokens, and for a text
        block that does not fit entirely, its trailing sentences that do.

        Returns:
            ``(chunks, overlap_blocks, overlap_tokens)``.  The overlap is empty
            when ``chunk_overlap`` is 0 and also when the whole chunk fits into
            the overlap budget (repeating it would duplicate the entire chunk).
        """
        chunks.append(MarkdownChunk(blocks=blocks, header=header))

        if not self.chunk_overlap:
            return chunks, [], 0

        overlap_tokens = 0
        overlap_blocks = []

        for block in reversed(blocks):
            if block.block_type == "Text":
                kept = []

                for sentence in reversed(block.sentences):
                    if overlap_tokens + sentence.tokens > self.chunk_overlap:
                        # Budget exhausted inside this block: keep only the
                        # trailing sentences that fit, never an empty block.
                        if kept:
                            overlap_blocks.insert(0, TextBlock.from_sentences(sentences=kept, page=block.page))
                        return chunks, overlap_blocks, overlap_tokens

                    kept.insert(0, sentence)
                    overlap_tokens += sentence.tokens

                # The whole block fits; without this its tokens would be
                # counted but its text would be missing from the overlap.
                if kept:
                    overlap_blocks.insert(0, block)

            else:
                if overlap_tokens + block.tokens > self.chunk_overlap:
                    return chunks, overlap_blocks, overlap_tokens

                overlap_blocks.insert(0, block)
                overlap_tokens += block.tokens

        return chunks, [], 0
sec2md/chunking.py ADDED
@@ -0,0 +1,66 @@
1
+ """Chunking utilities for page-aware markdown splitting."""
2
+
3
+ from typing import List, Optional
4
+ from sec2md.models import Page, Section
5
+ from sec2md.chunker.markdown_chunker import MarkdownChunker
6
+ from sec2md.chunker.markdown_chunk import MarkdownChunk
7
+
8
+
9
def chunk_pages(
    pages: List[Page],
    chunk_size: int = 512,
    chunk_overlap: int = 128,
    header: Optional[str] = None
) -> List[MarkdownChunk]:
    """
    Split a list of pages into overlapping markdown chunks.

    Each page is reduced to its ``number`` and ``content`` and the result is
    handed to a ``MarkdownChunker`` configured with the given sizes.

    Args:
        pages: Page objects to chunk
        chunk_size: Target size of a chunk in tokens (estimated as chars/4)
        chunk_overlap: Number of tokens shared between neighbouring chunks
        header: Optional header prepended to each chunk's embedding_text

    Returns:
        MarkdownChunk objects that keep track of their pages

    Example:
        >>> pages = sec2md.convert_to_markdown(html, return_pages=True)
        >>> chunks = sec2md.chunk_pages(pages, chunk_size=512)
        >>> for chunk in chunks:
        ...     print(f"Page {chunk.page}: {chunk.content[:100]}...")
    """
    splitter = MarkdownChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    page_dicts = []
    for page in pages:
        page_dicts.append({"page": page.number, "content": page.content})

    return splitter.split(pages=page_dicts, header=header)
36
+
37
+
38
def chunk_section(
    section: Section,
    chunk_size: int = 512,
    chunk_overlap: int = 128,
    header: Optional[str] = None
) -> List[MarkdownChunk]:
    """
    Split one filing section into overlapping markdown chunks.

    Convenience wrapper that forwards the section's ``pages`` to
    ``chunk_pages`` with the same settings.

    Args:
        section: Section object from extract_sections()
        chunk_size: Target size of a chunk in tokens (estimated as chars/4)
        chunk_overlap: Number of tokens shared between neighbouring chunks
        header: Optional header prepended to each chunk's embedding_text

    Returns:
        MarkdownChunk objects for the section

    Example:
        >>> sections = sec2md.extract_sections(pages, filing_type="10-K")
        >>> risk = sec2md.get_section(sections, Item10K.RISK_FACTORS)
        >>> chunks = sec2md.chunk_section(risk, chunk_size=512)
    """
    section_pages = section.pages
    return chunk_pages(section_pages, chunk_size, chunk_overlap, header)
sec2md/core.py ADDED
@@ -0,0 +1,93 @@
1
+ """Core conversion functionality."""
2
+
3
from typing import List, Literal, overload

from sec2md.utils import is_url, fetch
from sec2md.parser import Parser
from sec2md.models import Page
7
+
8
+
9
# The overloads are keyed on the literal value of ``return_pages`` so that
# type checkers can infer ``str`` vs ``List[Page]``; the last overload covers
# callers that pass a plain ``bool`` variable.
@overload
def convert_to_markdown(
    source: str | bytes,
    *,
    user_agent: str | None = None,
    return_pages: Literal[False] = False,
) -> str: ...


@overload
def convert_to_markdown(
    source: str | bytes,
    *,
    user_agent: str | None = None,
    return_pages: Literal[True],
) -> List[Page]: ...


@overload
def convert_to_markdown(
    source: str | bytes,
    *,
    user_agent: str | None = None,
    return_pages: bool,
) -> str | List[Page]: ...


def convert_to_markdown(
    source: str | bytes,
    *,
    user_agent: str | None = None,
    return_pages: bool = False,
) -> str | List[Page]:
    """
    Convert SEC filing HTML to Markdown.

    Args:
        source: URL or HTML string/bytes. Bytes are decoded as UTF-8 and
            undecodable bytes are dropped.
        user_agent: User agent passed to ``fetch`` when ``source`` is a URL
            (needed for sec.gov URLs)
        return_pages: If True, returns List[Page] instead of markdown string

    Returns:
        Markdown string (default) or List[Page] if return_pages=True

    Raises:
        ValueError: If source starts with the ``%PDF`` signature

    Examples:
        >>> # From URL - get markdown
        >>> md = convert_to_markdown(
        ...     "https://www.sec.gov/Archives/edgar/data/.../10k.htm",
        ...     user_agent="Your Name <you@example.com>"
        ... )

        >>> # Get pages for section extraction
        >>> pages = convert_to_markdown(filing.html(), return_pages=True)

        >>> # With edgartools
        >>> from edgar import Company, set_identity
        >>> set_identity("Your Name <you@example.com>")
        >>> company = Company('AAPL')
        >>> filing = company.get_filings(form="10-K").latest()
        >>> md = convert_to_markdown(filing.html())
    """
    pdf_error = (
        "PDF content detected. This library only supports HTML input. "
        "Please extract HTML from the filing first."
    )

    if isinstance(source, bytes):
        # Check the raw signature before decoding.
        if source.startswith(b'%PDF'):
            raise ValueError(pdf_error)
        source = source.decode('utf-8', errors='ignore')

    # Strings (including freshly decoded bytes) may carry leading whitespace
    # in front of the PDF signature.
    if isinstance(source, str) and source.lstrip().startswith('%PDF'):
        raise ValueError(pdf_error)

    # Fetch from URL if needed; otherwise the source already is the HTML.
    html = fetch(source, user_agent=user_agent) if is_url(source) else source

    parser = Parser(html)
    return parser.get_pages() if return_pages else parser.markdown()