sec2md 0.1.5__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sec2md might be problematic. Click here for more details.

sec2md/__init__.py CHANGED
@@ -5,12 +5,12 @@ from sec2md.utils import flatten_note
5
5
  from sec2md.sections import extract_sections, get_section
6
6
  from sec2md.chunking import chunk_pages, chunk_section, merge_text_blocks, chunk_text_block
7
7
  from sec2md.models import Page, Section, Item10K, Item10Q, Item8K, FilingType, Element, TextBlock, Exhibit
8
- from sec2md.chunker.markdown_chunk import MarkdownChunk
9
- from sec2md.chunker.markdown_chunker import MarkdownChunker
8
+ from sec2md.chunker.chunk import Chunk
9
+ from sec2md.chunker.chunker import Chunker
10
10
  from sec2md.parser import Parser
11
11
  from sec2md.section_extractor import SectionExtractor
12
12
 
13
- __version__ = "0.1.5"
13
+ __version__ = "0.1.10"
14
14
  __all__ = [
15
15
  "convert_to_markdown",
16
16
  "flatten_note",
@@ -29,8 +29,8 @@ __all__ = [
29
29
  "Item10Q",
30
30
  "Item8K",
31
31
  "FilingType",
32
- "MarkdownChunk",
33
- "MarkdownChunker",
32
+ "Chunk",
33
+ "Chunker",
34
34
  "Parser",
35
35
  "SectionExtractor",
36
36
  ]
@@ -73,40 +73,14 @@ class TextBlock(BaseBlock):
73
73
  return cls(content=content, page=page, block_type='Text')
74
74
 
75
75
 
76
- class AudioParagraphBlock(BaseBlock):
77
- block_type: str = Field(default="Text", description="Audio paragraph block type")
78
- paragraph_id: int = Field(..., description="Paragraph ID")
79
- audio_start: float = Field(..., description="Audio start time")
80
- audio_end: float = Field(..., description="Audio end time")
81
-
82
- @computed_field
83
- @property
84
- def sentences(self) -> List[Sentence]:
85
- """Returns the text block sentences"""
86
- return [Sentence(content=content) for content in split_sentences(self.content)]
87
-
88
- def format(self) -> dict:
89
- """Formats the audio paragraphs"""
90
- return {"id": self.paragraph_id, "content": self.content, "start": self.audio_start, "end": self.audio_end}
91
-
92
-
93
- class TableBlock(BaseModel):
76
+ class TableBlock(BaseBlock):
94
77
  block_type: str = Field(default='Table', description="Table block type")
95
- content: str = Field(..., description="Table content")
96
- page: int = Field(..., description="Page number")
97
-
98
- model_config = {"frozen": False}
99
78
 
100
79
  def __init__(self, **data):
101
80
  if 'content' in data:
102
81
  data['content'] = self._to_minified_markdown_static(data['content'])
103
82
  super().__init__(**data)
104
83
 
105
- @computed_field
106
- @property
107
- def tokens(self) -> int:
108
- return estimate_tokens(self.content)
109
-
110
84
  @staticmethod
111
85
  def _to_minified_markdown_static(content: str) -> str:
112
86
  """Returns the table in a Minified Markdown format"""
@@ -1,7 +1,7 @@
1
1
  from typing import List, Optional, Tuple, TYPE_CHECKING
2
2
  from pydantic import BaseModel, Field, computed_field
3
3
 
4
- from sec2md.chunker.markdown_blocks import BaseBlock
4
+ from sec2md.chunker.blocks import BaseBlock
5
5
 
6
6
  if TYPE_CHECKING:
7
7
  from sec2md.models import Element
@@ -9,8 +9,8 @@ else:
9
9
  Element = 'Element' # Forward reference for Pydantic
10
10
 
11
11
 
12
- class MarkdownChunk(BaseModel):
13
- """Represents a chunk of markdown content that can be embedded"""
12
+ class Chunk(BaseModel):
13
+ """Represents a chunk of content that can be embedded"""
14
14
 
15
15
  blocks: List[BaseBlock] = Field(..., description="List of markdown blocks in this chunk")
16
16
  header: Optional[str] = Field(None, description="Optional header for embedding context")
@@ -126,7 +126,7 @@ class MarkdownChunk(BaseModel):
126
126
 
127
127
  def __repr__(self):
128
128
  pages_str = f"{self.start_page}-{self.end_page}" if self.start_page != self.end_page else str(self.start_page)
129
- return f"MarkdownChunk(pages={pages_str}, blocks={len(self.blocks)}, tokens={self.num_tokens})"
129
+ return f"Chunk(pages={pages_str}, blocks={len(self.blocks)}, tokens={self.num_tokens})"
130
130
 
131
131
  def _repr_markdown_(self):
132
132
  """This method is called by IPython to display as Markdown"""
@@ -1,24 +1,24 @@
1
1
  import logging
2
2
  from typing import Union, Tuple, List, Dict, Any
3
3
 
4
- from sec2md.chunker.markdown_chunk import MarkdownChunk
5
- from sec2md.chunker.markdown_blocks import BaseBlock, TextBlock, TableBlock, HeaderBlock
4
+ from sec2md.chunker.chunk import Chunk
5
+ from sec2md.chunker.blocks import BaseBlock, TextBlock, TableBlock, HeaderBlock
6
6
 
7
- # Rebuild MarkdownChunk after Element is defined
7
+ # Rebuild Chunk after Element is defined
8
8
  from sec2md.models import Element
9
- MarkdownChunk.model_rebuild()
9
+ Chunk.model_rebuild()
10
10
 
11
11
  logger = logging.getLogger(__name__)
12
12
 
13
13
 
14
- class MarkdownChunker:
15
- """Splits markdown content into chunks"""
14
+ class Chunker:
15
+ """Splits content into chunks"""
16
16
 
17
17
  def __init__(self, chunk_size: int = 512, chunk_overlap: int = 128):
18
18
  self.chunk_size = chunk_size
19
19
  self.chunk_overlap = chunk_overlap
20
20
 
21
- def split(self, pages: List[Any], header: str = None) -> List[MarkdownChunk]:
21
+ def split(self, pages: List[Any], header: str = None) -> List[Chunk]:
22
22
  """Split the pages into chunks with optional header for embedding context.
23
23
 
24
24
  Args:
@@ -26,7 +26,7 @@ class MarkdownChunker:
26
26
  header: Optional header to prepend to each chunk's embedding_text
27
27
 
28
28
  Returns:
29
- List of MarkdownChunk objects
29
+ List of Chunk objects
30
30
  """
31
31
  # Build element map: page -> List[Element objects]
32
32
  page_elements = {}
@@ -64,14 +64,14 @@ class MarkdownChunker:
64
64
  last_page = page
65
65
 
66
66
  for line in page.content.split('\n'):
67
- if table_content and not MarkdownChunker._is_table_line(line):
67
+ if table_content and not Chunker._is_table_line(line):
68
68
  blocks.append(TableBlock(content=table_content, page=page.number))
69
69
  table_content = ""
70
70
 
71
71
  if line.startswith("#"):
72
72
  blocks.append(HeaderBlock(content=line, page=page.number))
73
73
 
74
- elif MarkdownChunker._is_table_line(line):
74
+ elif Chunker._is_table_line(line):
75
75
  table_content += f"{line}\n"
76
76
 
77
77
  else:
@@ -96,7 +96,7 @@ class MarkdownChunker:
96
96
  return True
97
97
  return True
98
98
 
99
- def _chunk_blocks(self, blocks: List[BaseBlock], header: str = None, page_elements: dict = None) -> List[MarkdownChunk]:
99
+ def _chunk_blocks(self, blocks: List[BaseBlock], header: str = None, page_elements: dict = None) -> List[Chunk]:
100
100
  """Converts the blocks to chunks"""
101
101
  page_elements = page_elements or {}
102
102
  chunks = []
@@ -127,7 +127,7 @@ class MarkdownChunker:
127
127
  return chunks
128
128
 
129
129
  def _process_text_block(self, block: TextBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
130
- chunks: List[MarkdownChunk], header: str = None, page_elements: dict = None):
130
+ chunks: List[Chunk], header: str = None, page_elements: dict = None):
131
131
  """Process a text block by breaking it into sentences if needed"""
132
132
  sentences = []
133
133
  sentences_tokens = 0
@@ -156,7 +156,7 @@ class MarkdownChunker:
156
156
  return chunk_blocks, num_tokens, chunks
157
157
 
158
158
  def _process_table_block(self, block: BaseBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
159
- chunks: List[MarkdownChunk], all_blocks: List[BaseBlock], block_idx: int, header: str = None, page_elements: dict = None):
159
+ chunks: List[Chunk], all_blocks: List[BaseBlock], block_idx: int, header: str = None, page_elements: dict = None):
160
160
  """Process a table block with optional header backtrack"""
161
161
  context = []
162
162
  context_tokens = 0
@@ -200,7 +200,7 @@ class MarkdownChunker:
200
200
  return chunk_blocks, num_tokens, chunks
201
201
 
202
202
  def _process_header_table_block(self, block: BaseBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
203
- chunks: List[MarkdownChunk], next_block: BaseBlock, header: str = None, page_elements: dict = None):
203
+ chunks: List[Chunk], next_block: BaseBlock, header: str = None, page_elements: dict = None):
204
204
  """Process a header block"""
205
205
  if not chunk_blocks:
206
206
  chunk_blocks.append(block)
@@ -223,17 +223,17 @@ class MarkdownChunker:
223
223
 
224
224
  return chunk_blocks, num_tokens, chunks
225
225
 
226
- def _finalize_chunk(self, chunks: List[MarkdownChunk], blocks: List[BaseBlock], header: str, page_elements: dict):
226
+ def _finalize_chunk(self, chunks: List[Chunk], blocks: List[BaseBlock], header: str, page_elements: dict):
227
227
  """Create chunk with elements from the pages it spans"""
228
228
  chunk_pages = set(block.page for block in blocks)
229
229
  elements = []
230
230
  for page_num in sorted(chunk_pages):
231
231
  if page_num in page_elements:
232
232
  elements.extend(page_elements[page_num])
233
- chunks.append(MarkdownChunk(blocks=blocks, header=header, elements=elements))
233
+ chunks.append(Chunk(blocks=blocks, header=header, elements=elements))
234
234
 
235
- def _create_chunk(self, chunks: List[MarkdownChunk], blocks: List[BaseBlock], header: str = None, page_elements: dict = None) -> Tuple[
236
- List[MarkdownChunk], List[BaseBlock], int]:
235
+ def _create_chunk(self, chunks: List[Chunk], blocks: List[BaseBlock], header: str = None, page_elements: dict = None) -> Tuple[
236
+ List[Chunk], List[BaseBlock], int]:
237
237
  """Creates a chunk and returns overlap blocks"""
238
238
  page_elements = page_elements or {}
239
239
  self._finalize_chunk(chunks, blocks, header, page_elements)
sec2md/chunking.py CHANGED
@@ -1,10 +1,10 @@
1
- """Chunking utilities for page-aware markdown splitting."""
1
+ """Chunking utilities for page-aware splitting."""
2
2
 
3
3
  from typing import List, Optional
4
4
  from collections import defaultdict
5
5
  from sec2md.models import Page, Section, TextBlock
6
- from sec2md.chunker.markdown_chunker import MarkdownChunker
7
- from sec2md.chunker.markdown_chunk import MarkdownChunk
6
+ from sec2md.chunker.chunker import Chunker
7
+ from sec2md.chunker.chunk import Chunk
8
8
 
9
9
 
10
10
  def chunk_pages(
@@ -12,9 +12,9 @@ def chunk_pages(
12
12
  chunk_size: int = 512,
13
13
  chunk_overlap: int = 128,
14
14
  header: Optional[str] = None
15
- ) -> List[MarkdownChunk]:
15
+ ) -> List[Chunk]:
16
16
  """
17
- Chunk pages into overlapping markdown chunks.
17
+ Chunk pages into overlapping chunks.
18
18
 
19
19
  Args:
20
20
  pages: List of Page objects (with optional elements)
@@ -23,7 +23,7 @@ def chunk_pages(
23
23
  header: Optional header to prepend to each chunk's embedding_text
24
24
 
25
25
  Returns:
26
- List of MarkdownChunk objects with page tracking and elements
26
+ List of Chunk objects with page tracking and elements
27
27
 
28
28
  Example:
29
29
  >>> pages = sec2md.convert_to_markdown(html, return_pages=True, include_elements=True)
@@ -32,7 +32,7 @@ def chunk_pages(
32
32
  ... print(f"Page {chunk.page}: {chunk.content[:100]}...")
33
33
  ... print(f"Elements: {chunk.elements}")
34
34
  """
35
- chunker = MarkdownChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
35
+ chunker = Chunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
36
36
  return chunker.split(pages=pages, header=header)
37
37
 
38
38
 
@@ -41,9 +41,9 @@ def chunk_section(
41
41
  chunk_size: int = 512,
42
42
  chunk_overlap: int = 128,
43
43
  header: Optional[str] = None
44
- ) -> List[MarkdownChunk]:
44
+ ) -> List[Chunk]:
45
45
  """
46
- Chunk a filing section into overlapping markdown chunks.
46
+ Chunk a filing section into overlapping chunks.
47
47
 
48
48
  Args:
49
49
  section: Section object from extract_sections()
@@ -52,7 +52,7 @@ def chunk_section(
52
52
  header: Optional header to prepend to each chunk's embedding_text
53
53
 
54
54
  Returns:
55
- List of MarkdownChunk objects
55
+ List of Chunk objects
56
56
 
57
57
  Example:
58
58
  >>> sections = sec2md.extract_sections(pages, filing_type="10-K")
@@ -79,8 +79,8 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
79
79
 
80
80
  Returns:
81
81
  List of merged TextBlock objects with page metadata:
82
- - page_start: First page the note appears on
83
- - page_end: Last page the note appears on
82
+ - start_page: First page the note appears on
83
+ - end_page: Last page the note appears on
84
84
  - source_pages: All pages the note spans
85
85
  - elements: All elements from all pages
86
86
 
@@ -88,7 +88,7 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
88
88
  >>> pages = parser.get_pages(include_elements=True)
89
89
  >>> merged = merge_text_blocks(pages)
90
90
  >>> for tb in merged:
91
- ... print(f"{tb.title}: pages {tb.page_start}-{tb.page_end}")
91
+ ... print(f"{tb.title}: pages {tb.start_page}-{tb.end_page}")
92
92
  Debt Disclosure: pages 45-46
93
93
  Segment Reporting: pages 49-50
94
94
  """
@@ -97,8 +97,8 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
97
97
  "name": None,
98
98
  "title": None,
99
99
  "elements": [],
100
- "page_start": float('inf'),
101
- "page_end": -1,
100
+ "start_page": float('inf'),
101
+ "end_page": -1,
102
102
  "pages": set()
103
103
  })
104
104
 
@@ -108,8 +108,8 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
108
108
  tb_map[tb.name]["name"] = tb.name
109
109
  tb_map[tb.name]["title"] = tb.title
110
110
  tb_map[tb.name]["elements"].extend(tb.elements)
111
- tb_map[tb.name]["page_start"] = min(tb_map[tb.name]["page_start"], page.number)
112
- tb_map[tb.name]["page_end"] = max(tb_map[tb.name]["page_end"], page.number)
111
+ tb_map[tb.name]["start_page"] = min(tb_map[tb.name]["start_page"], page.number)
112
+ tb_map[tb.name]["end_page"] = max(tb_map[tb.name]["end_page"], page.number)
113
113
  tb_map[tb.name]["pages"].add(page.number)
114
114
 
115
115
  # Create merged TextBlock objects
@@ -119,8 +119,8 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
119
119
  name=tb_data["name"],
120
120
  title=tb_data["title"],
121
121
  elements=tb_data["elements"],
122
- page_start=tb_data["page_start"],
123
- page_end=tb_data["page_end"],
122
+ start_page=tb_data["start_page"],
123
+ end_page=tb_data["end_page"],
124
124
  source_pages=sorted(tb_data["pages"])
125
125
  )
126
126
  merged.append(tb)
@@ -132,8 +132,8 @@ def chunk_text_block(
132
132
  text_block: TextBlock,
133
133
  chunk_size: int = 512,
134
134
  chunk_overlap: int = 128,
135
- include_title_as_header: bool = True
136
- ) -> List[MarkdownChunk]:
135
+ header: Optional[str] = None
136
+ ) -> List[Chunk]:
137
137
  """
138
138
  Chunk a single TextBlock (financial note).
139
139
 
@@ -141,17 +141,17 @@ def chunk_text_block(
141
141
  text_block: TextBlock object (possibly spanning multiple pages)
142
142
  chunk_size: Target chunk size in tokens (estimated as chars/4)
143
143
  chunk_overlap: Overlap between chunks in tokens
144
- include_title_as_header: Prepend note title to chunks for embedding
144
+ header: Optional header to prepend to each chunk's embedding_text
145
145
 
146
146
  Returns:
147
- List of MarkdownChunk objects with elements preserved
147
+ List of Chunk objects with elements preserved
148
148
 
149
149
  Example:
150
150
  >>> merged = merge_text_blocks(pages)
151
151
  >>> debt_note = [tb for tb in merged if "Debt" in tb.title][0]
152
- >>> chunks = chunk_text_block(debt_note, chunk_size=512)
152
+ >>> chunks = chunk_text_block(debt_note, chunk_size=512, header="Company: AAPL | Note: Debt")
153
153
  >>> print(f"Chunked {debt_note.title} into {len(chunks)} chunks")
154
- >>> print(f"Note spans pages {debt_note.page_start}-{debt_note.page_end}")
154
+ >>> print(f"Note spans pages {debt_note.start_page}-{debt_note.end_page}")
155
155
  """
156
156
  # Group elements by page
157
157
  elements_by_page = defaultdict(list)
@@ -172,8 +172,6 @@ def chunk_text_block(
172
172
  elements=elems # Only elements from this page
173
173
  ))
174
174
 
175
- # Chunk normally across all pages
176
- header = f"Note: {text_block.title}" if include_title_as_header and text_block.title else None
177
- chunker = MarkdownChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
175
+ chunker = Chunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
178
176
 
179
177
  return chunker.split(pages=pages, header=header)
sec2md/models.py CHANGED
@@ -248,8 +248,8 @@ class TextBlock(BaseModel):
248
248
  elements: List['Element'] = Field(default_factory=list, description="Element objects in this TextBlock")
249
249
 
250
250
  # Optional: Set by merge_text_blocks() for multi-page notes
251
- page_start: Optional[int] = Field(None, description="First page this TextBlock appears on")
252
- page_end: Optional[int] = Field(None, description="Last page this TextBlock appears on")
251
+ start_page: Optional[int] = Field(None, description="First page this TextBlock appears on")
252
+ end_page: Optional[int] = Field(None, description="Last page this TextBlock appears on")
253
253
  source_pages: Optional[List[int]] = Field(None, description="All pages this TextBlock spans")
254
254
 
255
255
  model_config = {"frozen": False, "arbitrary_types_allowed": True}
@@ -261,7 +261,7 @@ class TextBlock(BaseModel):
261
261
  return [e.id for e in self.elements]
262
262
 
263
263
  def __repr__(self) -> str:
264
- pages_info = f", pages={self.page_start}-{self.page_end}" if self.page_start else ""
264
+ pages_info = f", pages={self.start_page}-{self.end_page}" if self.start_page else ""
265
265
  return f"TextBlock(name='{self.name}', title='{self.title}', elements={len(self.elements)}{pages_info})"
266
266
 
267
267
 
sec2md/parser.py CHANGED
@@ -710,7 +710,7 @@ class Parser:
710
710
 
711
711
  return current
712
712
 
713
- def get_pages(self, include_elements: bool = False) -> List[Page]:
713
+ def get_pages(self, include_elements: bool = True) -> List[Page]:
714
714
  """Get parsed pages as Page objects.
715
715
 
716
716
  Args:
@@ -269,6 +269,12 @@ class SectionExtractor:
269
269
  title_inline = (m.group(3) or "").strip()
270
270
  # Clean markdown artifacts from title
271
271
  title_inline = MD_EDGE.sub("", title_inline)
272
+
273
+ # Skip TOC entries (they have page numbers like "| 3 |" in the title)
274
+ if re.search(r'\|\s*\d+\s*\|', title_inline):
275
+ self._log(f"DEBUG: Skipping TOC entry for ITEM {code}")
276
+ continue
277
+
272
278
  title = title_inline if title_inline else ITEM_8K_TITLES.get(code)
273
279
  headers.append({"start": m.start(), "end": m.end(), "no": code, "title": title})
274
280
  self._log(f"DEBUG: Found ITEM {code} at position {m.start()}")
@@ -303,6 +309,11 @@ class SectionExtractor:
303
309
  # Since 8-K sections can span pages, we need to find which pages contain this content
304
310
  section_pages = self._map_8k_content_to_pages(body)
305
311
 
312
+ # Skip sections with no matching pages
313
+ if not section_pages:
314
+ self._log(f"DEBUG: Skipping ITEM {code} (no pages found)")
315
+ continue
316
+
306
317
  # Create Section with exhibits (now part of the model)
307
318
  section = Section(
308
319
  part=None, # 8-K has no PART divisions
@@ -319,40 +330,67 @@ class SectionExtractor:
319
330
  return results
320
331
 
321
332
  def _map_8k_content_to_pages(self, section_content: str) -> List[Any]:
322
- """Map extracted section content back to Page objects."""
333
+ """Map extracted section content back to Page objects, splitting at section boundaries."""
323
334
  from sec2md.models import Page
324
335
 
325
- # Try to find which original pages contain this content
326
- # This is heuristic-based: match by content overlap
327
336
  matched_pages = []
328
- section_preview = section_content[:500] # Use first 500 chars for matching
337
+ section_content_cleaned = self._clean_8k_text(section_content)
338
+ remaining_section = section_content_cleaned
329
339
 
330
340
  for page_dict in self.pages:
331
341
  page_num = page_dict["page"]
332
- page_content = self._clean_8k_text(page_dict["content"])
342
+ page_content = page_dict["content"]
343
+ page_content_cleaned = self._clean_8k_text(page_content)
333
344
 
334
- # Check if this page contains part of the section
335
- if section_preview in page_content or page_content in section_content:
336
- original_page = self._original_pages.get(page_num)
337
- matched_pages.append(
338
- Page(
339
- number=page_num,
340
- content=page_content,
341
- elements=original_page.elements if original_page else None,
342
- text_blocks=original_page.text_blocks if original_page else None
343
- )
344
- )
345
+ # Skip pages that don't contain any of the remaining section content
346
+ if not any(chunk in page_content_cleaned for chunk in remaining_section[:200].split()[:10]):
347
+ continue
348
+
349
+ # Find where the section content appears on this page
350
+ # Use the original page to preserve formatting/elements
351
+ original_page = self._original_pages[page_num]
352
+
353
+ # For 8-K, we need to split the page content at ITEM boundaries
354
+ # Find all ITEM headers on this page
355
+ item_positions = []
356
+ for m in self._ITEM_8K_RE.finditer(page_content_cleaned):
357
+ code = self._normalize_8k_item_code(m.group(2))
358
+ title = (m.group(3) or "").strip()
359
+ # Skip TOC entries
360
+ if not re.search(r'\|\s*\d+\s*\|', title):
361
+ item_positions.append((m.start(), f"ITEM {code}"))
362
+
363
+ # Find which portion of the page belongs to this section
364
+ section_start_in_page = page_content_cleaned.find(section_content_cleaned[:100])
365
+
366
+ if section_start_in_page >= 0:
367
+ # Find the end: either next ITEM on this page, or end of page
368
+ section_end_in_page = len(page_content_cleaned)
369
+ for pos, item_code in item_positions:
370
+ # Find the next ITEM after our section starts
371
+ if pos > section_start_in_page + 50: # Give 50 chars buffer
372
+ section_end_in_page = pos
373
+ break
374
+
375
+ # Extract just this section's content from the page
376
+ page_section_content = page_content_cleaned[section_start_in_page:section_end_in_page].strip()
345
377
 
346
- # If no matches found (shouldn't happen), create a synthetic page
347
- if not matched_pages:
348
- matched_pages.append(
349
- Page(
350
- number=1,
351
- content=section_content,
352
- elements=None,
378
+ # Create a new Page with only this section's content
379
+ # Note: This loses elements, but keeps the section boundary clean
380
+ matched_pages.append(Page(
381
+ number=page_num,
382
+ content=page_section_content,
383
+ elements=None, # TODO: Could filter elements by content matching
353
384
  text_blocks=None
354
- )
355
- )
385
+ ))
386
+
387
+ # Update remaining section content to find on next pages
388
+ # Remove what we've matched from the section
389
+ matched_len = len(page_section_content)
390
+ remaining_section = remaining_section[matched_len:] if matched_len < len(remaining_section) else ""
391
+
392
+ if not remaining_section.strip():
393
+ break # Found all content for this section
356
394
 
357
395
  return matched_pages
358
396
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sec2md
3
- Version: 0.1.5
3
+ Version: 0.1.10
4
4
  Summary: Convert SEC EDGAR filings to LLM-ready Markdown for AI agents and agentic RAG
5
5
  Author-email: Lucas Astorian <lucas@intellifin.ai>
6
6
  License: MIT
@@ -0,0 +1,19 @@
1
+ sec2md/__init__.py,sha256=WHduz6dNVQ_pLZ-OMs-9ikWD8Brc0HdHh1sfo_ygQYU,988
2
+ sec2md/absolute_table_parser.py,sha256=rphc5_HttniV2RtPCThQ68HWyyZIn9l-gkaFsbtQXBU,22982
3
+ sec2md/chunking.py,sha256=OUjVffiqrHkFakFGjCZffE88G_jhIu9RBpVEbliF9jU,6115
4
+ sec2md/core.py,sha256=hmdJXitoEWuekR5f3B1oEK1xmPux0t494lOpg5aJrRk,2663
5
+ sec2md/models.py,sha256=zZNRp4S7pI_KHRSQwA04uSNYpDej-OzYW3S-mX2Irmc,14735
6
+ sec2md/parser.py,sha256=-uyorKhrXrn_3dKMqq4peo2bdxcGvkQVHI5riSXX7z4,47558
7
+ sec2md/section_extractor.py,sha256=otx4RObfNqP1zStilis9z4gDXp4mkN-9-tzIMACEIaE,28050
8
+ sec2md/sections.py,sha256=wtmKqF_KP_G-7_qAxGvxs25U_4vcH5NDGn14ouEy5GE,2784
9
+ sec2md/table_parser.py,sha256=FhR8OwX5NAJmzdbTFzHQTGUNUPieYN37UzMFbQMkogU,12551
10
+ sec2md/utils.py,sha256=2lUeN5irTbdIyjylCkaPKMv4ALWxWMJl96PTO8FV3Ik,2990
11
+ sec2md/chunker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ sec2md/chunker/blocks.py,sha256=LiPV0GX0LYGkV-3kfxeBA9OCmMVjOjrwL46PH8snXw4,3388
13
+ sec2md/chunker/chunk.py,sha256=eF7QAOita6AW_sp2Sg69853ZOH7npwM5o-AEem62RRk,4729
14
+ sec2md/chunker/chunker.py,sha256=_VhrxfSCarnPGIx6LHIurgCEiwH3Tz7kVZuECgTNw2w,10588
15
+ sec2md-0.1.10.dist-info/licenses/LICENSE,sha256=uJDiSGQ5TOx-PGhu2LGH4A-O53vS5hrQ5sc3j2Ps_Rk,1071
16
+ sec2md-0.1.10.dist-info/METADATA,sha256=xW9Jin_IALBKHTlFzHnY9inkHmKLmf9jCio5jYc-EnY,7626
17
+ sec2md-0.1.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
18
+ sec2md-0.1.10.dist-info/top_level.txt,sha256=Jpmw3laEWwS9fljtAEg4sExjFw3zP8dGarjIknyh1v8,7
19
+ sec2md-0.1.10.dist-info/RECORD,,
@@ -1,19 +0,0 @@
1
- sec2md/__init__.py,sha256=iR_2g-PDkCAzY76uQwBjIVpprvkxlNopdmDduzDp8lg,1037
2
- sec2md/absolute_table_parser.py,sha256=rphc5_HttniV2RtPCThQ68HWyyZIn9l-gkaFsbtQXBU,22982
3
- sec2md/chunking.py,sha256=SQASDA057bKLhSj34GNAHrRl94Rf-A9WlfEvhhWPuIc,6350
4
- sec2md/core.py,sha256=hmdJXitoEWuekR5f3B1oEK1xmPux0t494lOpg5aJrRk,2663
5
- sec2md/models.py,sha256=H_3HnI8exGVnbqbdT1Bf4bNhPLjqvlP64ud0au5ohJk,14735
6
- sec2md/parser.py,sha256=J1He6XMa1Mf9YGJCEffWuCs7SAqi0Ts6S445CTO-lAA,47559
7
- sec2md/section_extractor.py,sha256=JTbZpPgmTipzU1Q5LehlQ9y2X4ZcQRTj3A7iMr90iqM,25976
8
- sec2md/sections.py,sha256=wtmKqF_KP_G-7_qAxGvxs25U_4vcH5NDGn14ouEy5GE,2784
9
- sec2md/table_parser.py,sha256=FhR8OwX5NAJmzdbTFzHQTGUNUPieYN37UzMFbQMkogU,12551
10
- sec2md/utils.py,sha256=2lUeN5irTbdIyjylCkaPKMv4ALWxWMJl96PTO8FV3Ik,2990
11
- sec2md/chunker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
- sec2md/chunker/markdown_blocks.py,sha256=yEF_v72DvYOVu0ZQ5bBCFpNM12INg-8RmajIu_dorQQ,4372
13
- sec2md/chunker/markdown_chunk.py,sha256=hCMpjn0cc5TIjWSZviq4fM7e781X3AtRcmI60pDLWro,4763
14
- sec2md/chunker/markdown_chunker.py,sha256=IYW8pQ2q9hX1lRGw4TnKAQcr-HmJfSW7wffu-BA0Jms,10743
15
- sec2md-0.1.5.dist-info/licenses/LICENSE,sha256=uJDiSGQ5TOx-PGhu2LGH4A-O53vS5hrQ5sc3j2Ps_Rk,1071
16
- sec2md-0.1.5.dist-info/METADATA,sha256=YWQ9uiut1LcBQxOCvFcT8MlfgLO7VBCDtEju5h7fp6k,7625
17
- sec2md-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
18
- sec2md-0.1.5.dist-info/top_level.txt,sha256=Jpmw3laEWwS9fljtAEg4sExjFw3zP8dGarjIknyh1v8,7
19
- sec2md-0.1.5.dist-info/RECORD,,