sec2md 0.1.5-py3-none-any.whl → 0.1.12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of sec2md has been flagged as possibly problematic.

sec2md/__init__.py CHANGED
@@ -5,12 +5,12 @@ from sec2md.utils import flatten_note
 from sec2md.sections import extract_sections, get_section
 from sec2md.chunking import chunk_pages, chunk_section, merge_text_blocks, chunk_text_block
 from sec2md.models import Page, Section, Item10K, Item10Q, Item8K, FilingType, Element, TextBlock, Exhibit
-from sec2md.chunker.markdown_chunk import MarkdownChunk
-from sec2md.chunker.markdown_chunker import MarkdownChunker
+from sec2md.chunker.chunk import Chunk
+from sec2md.chunker.chunker import Chunker
 from sec2md.parser import Parser
 from sec2md.section_extractor import SectionExtractor
 
-__version__ = "0.1.5"
+__version__ = "0.1.12"
 __all__ = [
     "convert_to_markdown",
     "flatten_note",
@@ -29,8 +29,8 @@ __all__ = [
     "Item10Q",
     "Item8K",
     "FilingType",
-    "MarkdownChunk",
-    "MarkdownChunker",
+    "Chunk",
+    "Chunker",
     "Parser",
     "SectionExtractor",
 ]
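For downstream code, the rename is mechanical. A minimal migration sketch (assuming the names were imported from the package root, which the `__all__` list above exports; `pages` stands in for a list of parsed Page objects):

    # sec2md 0.1.5
    from sec2md import MarkdownChunk, MarkdownChunker
    chunker = MarkdownChunker(chunk_size=512, chunk_overlap=128)

    # sec2md 0.1.12: same behavior, new names
    from sec2md import Chunk, Chunker
    chunker = Chunker(chunk_size=512, chunk_overlap=128)
    chunks = chunker.split(pages=pages, header="Company: AAPL | Form: 10-K")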
sec2md/chunker/markdown_blocks.py → sec2md/chunker/blocks.py RENAMED
@@ -73,40 +73,14 @@ class TextBlock(BaseBlock):
         return cls(content=content, page=page, block_type='Text')
 
 
-class AudioParagraphBlock(BaseBlock):
-    block_type: str = Field(default="Text", description="Audio paragraph block type")
-    paragraph_id: int = Field(..., description="Paragraph ID")
-    audio_start: float = Field(..., description="Audio start time")
-    audio_end: float = Field(..., description="Audio end time")
-
-    @computed_field
-    @property
-    def sentences(self) -> List[Sentence]:
-        """Returns the text block sentences"""
-        return [Sentence(content=content) for content in split_sentences(self.content)]
-
-    def format(self) -> dict:
-        """Formats the audio paragraphs"""
-        return {"id": self.paragraph_id, "content": self.content, "start": self.audio_start, "end": self.audio_end}
-
-
-class TableBlock(BaseModel):
+class TableBlock(BaseBlock):
     block_type: str = Field(default='Table', description="Table block type")
-    content: str = Field(..., description="Table content")
-    page: int = Field(..., description="Page number")
-
-    model_config = {"frozen": False}
 
     def __init__(self, **data):
         if 'content' in data:
             data['content'] = self._to_minified_markdown_static(data['content'])
         super().__init__(**data)
 
-    @computed_field
-    @property
-    def tokens(self) -> int:
-        return estimate_tokens(self.content)
-
     @staticmethod
     def _to_minified_markdown_static(content: str) -> str:
         """Returns the table in a Minified Markdown format"""
sec2md/chunker/markdown_chunk.py → sec2md/chunker/chunk.py RENAMED
@@ -1,7 +1,7 @@
 from typing import List, Optional, Tuple, TYPE_CHECKING
 from pydantic import BaseModel, Field, computed_field
 
-from sec2md.chunker.markdown_blocks import BaseBlock
+from sec2md.chunker.blocks import BaseBlock
 
 if TYPE_CHECKING:
     from sec2md.models import Element
@@ -9,8 +9,8 @@ else:
     Element = 'Element'  # Forward reference for Pydantic
 
 
-class MarkdownChunk(BaseModel):
-    """Represents a chunk of markdown content that can be embedded"""
+class Chunk(BaseModel):
+    """Represents a chunk of content that can be embedded"""
 
     blocks: List[BaseBlock] = Field(..., description="List of markdown blocks in this chunk")
     header: Optional[str] = Field(None, description="Optional header for embedding context")
@@ -126,7 +126,7 @@ class MarkdownChunk(BaseModel):
 
     def __repr__(self):
         pages_str = f"{self.start_page}-{self.end_page}" if self.start_page != self.end_page else str(self.start_page)
-        return f"MarkdownChunk(pages={pages_str}, blocks={len(self.blocks)}, tokens={self.num_tokens})"
+        return f"Chunk(pages={pages_str}, blocks={len(self.blocks)}, tokens={self.num_tokens})"
 
     def _repr_markdown_(self):
         """This method is called by IPython to display as Markdown"""
sec2md/chunker/markdown_chunker.py → sec2md/chunker/chunker.py RENAMED
@@ -1,24 +1,24 @@
 import logging
 from typing import Union, Tuple, List, Dict, Any
 
-from sec2md.chunker.markdown_chunk import MarkdownChunk
-from sec2md.chunker.markdown_blocks import BaseBlock, TextBlock, TableBlock, HeaderBlock
+from sec2md.chunker.chunk import Chunk
+from sec2md.chunker.blocks import BaseBlock, TextBlock, TableBlock, HeaderBlock
 
-# Rebuild MarkdownChunk after Element is defined
+# Rebuild Chunk after Element is defined
 from sec2md.models import Element
-MarkdownChunk.model_rebuild()
+Chunk.model_rebuild()
 
 logger = logging.getLogger(__name__)
 
 
-class MarkdownChunker:
-    """Splits markdown content into chunks"""
+class Chunker:
+    """Splits content into chunks"""
 
     def __init__(self, chunk_size: int = 512, chunk_overlap: int = 128):
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
 
-    def split(self, pages: List[Any], header: str = None) -> List[MarkdownChunk]:
+    def split(self, pages: List[Any], header: str = None) -> List[Chunk]:
         """Split the pages into chunks with optional header for embedding context.
 
         Args:
@@ -26,7 +26,7 @@ class MarkdownChunker:
             header: Optional header to prepend to each chunk's embedding_text
 
         Returns:
-            List of MarkdownChunk objects
+            List of Chunk objects
         """
         # Build element map: page -> List[Element objects]
         page_elements = {}
@@ -64,14 +64,14 @@ class MarkdownChunker:
             last_page = page
 
             for line in page.content.split('\n'):
-                if table_content and not MarkdownChunker._is_table_line(line):
+                if table_content and not Chunker._is_table_line(line):
                     blocks.append(TableBlock(content=table_content, page=page.number))
                     table_content = ""
 
                 if line.startswith("#"):
                     blocks.append(HeaderBlock(content=line, page=page.number))
 
-                elif MarkdownChunker._is_table_line(line):
+                elif Chunker._is_table_line(line):
                     table_content += f"{line}\n"
 
                 else:
@@ -96,7 +96,7 @@ class MarkdownChunker:
                 return True
         return True
 
-    def _chunk_blocks(self, blocks: List[BaseBlock], header: str = None, page_elements: dict = None) -> List[MarkdownChunk]:
+    def _chunk_blocks(self, blocks: List[BaseBlock], header: str = None, page_elements: dict = None) -> List[Chunk]:
         """Converts the blocks to chunks"""
         page_elements = page_elements or {}
         chunks = []
@@ -127,7 +127,7 @@ class MarkdownChunker:
         return chunks
 
     def _process_text_block(self, block: TextBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
-                            chunks: List[MarkdownChunk], header: str = None, page_elements: dict = None):
+                            chunks: List[Chunk], header: str = None, page_elements: dict = None):
         """Process a text block by breaking it into sentences if needed"""
         sentences = []
         sentences_tokens = 0
@@ -156,7 +156,7 @@ class MarkdownChunker:
         return chunk_blocks, num_tokens, chunks
 
     def _process_table_block(self, block: BaseBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
-                             chunks: List[MarkdownChunk], all_blocks: List[BaseBlock], block_idx: int, header: str = None, page_elements: dict = None):
+                             chunks: List[Chunk], all_blocks: List[BaseBlock], block_idx: int, header: str = None, page_elements: dict = None):
         """Process a table block with optional header backtrack"""
         context = []
         context_tokens = 0
@@ -200,7 +200,7 @@ class MarkdownChunker:
         return chunk_blocks, num_tokens, chunks
 
     def _process_header_table_block(self, block: BaseBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
-                                    chunks: List[MarkdownChunk], next_block: BaseBlock, header: str = None, page_elements: dict = None):
+                                    chunks: List[Chunk], next_block: BaseBlock, header: str = None, page_elements: dict = None):
         """Process a header block"""
         if not chunk_blocks:
             chunk_blocks.append(block)
@@ -223,17 +223,17 @@ class MarkdownChunker:
 
         return chunk_blocks, num_tokens, chunks
 
-    def _finalize_chunk(self, chunks: List[MarkdownChunk], blocks: List[BaseBlock], header: str, page_elements: dict):
+    def _finalize_chunk(self, chunks: List[Chunk], blocks: List[BaseBlock], header: str, page_elements: dict):
         """Create chunk with elements from the pages it spans"""
         chunk_pages = set(block.page for block in blocks)
         elements = []
         for page_num in sorted(chunk_pages):
             if page_num in page_elements:
                 elements.extend(page_elements[page_num])
-        chunks.append(MarkdownChunk(blocks=blocks, header=header, elements=elements))
+        chunks.append(Chunk(blocks=blocks, header=header, elements=elements))
 
-    def _create_chunk(self, chunks: List[MarkdownChunk], blocks: List[BaseBlock], header: str = None, page_elements: dict = None) -> Tuple[
-        List[MarkdownChunk], List[BaseBlock], int]:
+    def _create_chunk(self, chunks: List[Chunk], blocks: List[BaseBlock], header: str = None, page_elements: dict = None) -> Tuple[
+        List[Chunk], List[BaseBlock], int]:
         """Creates a chunk and returns overlap blocks"""
         page_elements = page_elements or {}
         self._finalize_chunk(chunks, blocks, header, page_elements)
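Throughout the chunker, `chunk_size` and `chunk_overlap` are token budgets, and the `tokens` property removed from TableBlock above relied on an `estimate_tokens` helper that this diff does not show. A sketch consistent with the "chars/4" estimate documented in chunking.py below (the helper's exact shape is an assumption):

    def estimate_tokens(text: str) -> int:
        # Rough heuristic per the chunking.py docstrings: ~4 characters per token
        return max(1, len(text) // 4)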
sec2md/chunking.py CHANGED
@@ -1,10 +1,10 @@
-"""Chunking utilities for page-aware markdown splitting."""
+"""Chunking utilities for page-aware splitting."""
 
 from typing import List, Optional
 from collections import defaultdict
 from sec2md.models import Page, Section, TextBlock
-from sec2md.chunker.markdown_chunker import MarkdownChunker
-from sec2md.chunker.markdown_chunk import MarkdownChunk
+from sec2md.chunker.chunker import Chunker
+from sec2md.chunker.chunk import Chunk
 
 
 def chunk_pages(
@@ -12,9 +12,9 @@ def chunk_pages(
     chunk_size: int = 512,
     chunk_overlap: int = 128,
     header: Optional[str] = None
-) -> List[MarkdownChunk]:
+) -> List[Chunk]:
     """
-    Chunk pages into overlapping markdown chunks.
+    Chunk pages into overlapping chunks.
 
     Args:
         pages: List of Page objects (with optional elements)
@@ -23,7 +23,7 @@ def chunk_pages(
         header: Optional header to prepend to each chunk's embedding_text
 
     Returns:
-        List of MarkdownChunk objects with page tracking and elements
+        List of Chunk objects with page tracking and elements
 
     Example:
         >>> pages = sec2md.convert_to_markdown(html, return_pages=True, include_elements=True)
@@ -32,7 +32,7 @@ def chunk_pages(
         ...     print(f"Page {chunk.page}: {chunk.content[:100]}...")
         ...     print(f"Elements: {chunk.elements}")
     """
-    chunker = MarkdownChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    chunker = Chunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     return chunker.split(pages=pages, header=header)
 
 
@@ -41,9 +41,9 @@ def chunk_section(
     chunk_size: int = 512,
     chunk_overlap: int = 128,
     header: Optional[str] = None
-) -> List[MarkdownChunk]:
+) -> List[Chunk]:
     """
-    Chunk a filing section into overlapping markdown chunks.
+    Chunk a filing section into overlapping chunks.
 
     Args:
         section: Section object from extract_sections()
@@ -52,7 +52,7 @@ def chunk_section(
         header: Optional header to prepend to each chunk's embedding_text
 
     Returns:
-        List of MarkdownChunk objects
+        List of Chunk objects
 
     Example:
         >>> sections = sec2md.extract_sections(pages, filing_type="10-K")
@@ -79,8 +79,8 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
 
     Returns:
         List of merged TextBlock objects with page metadata:
-        - page_start: First page the note appears on
-        - page_end: Last page the note appears on
+        - start_page: First page the note appears on
+        - end_page: Last page the note appears on
         - source_pages: All pages the note spans
         - elements: All elements from all pages
 
@@ -88,7 +88,7 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
         >>> pages = parser.get_pages(include_elements=True)
         >>> merged = merge_text_blocks(pages)
         >>> for tb in merged:
-        ...     print(f"{tb.title}: pages {tb.page_start}-{tb.page_end}")
+        ...     print(f"{tb.title}: pages {tb.start_page}-{tb.end_page}")
         Debt Disclosure: pages 45-46
         Segment Reporting: pages 49-50
     """
@@ -97,8 +97,8 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
         "name": None,
         "title": None,
         "elements": [],
-        "page_start": float('inf'),
-        "page_end": -1,
+        "start_page": float('inf'),
+        "end_page": -1,
         "pages": set()
     })
 
@@ -108,8 +108,8 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
             tb_map[tb.name]["name"] = tb.name
             tb_map[tb.name]["title"] = tb.title
             tb_map[tb.name]["elements"].extend(tb.elements)
-            tb_map[tb.name]["page_start"] = min(tb_map[tb.name]["page_start"], page.number)
-            tb_map[tb.name]["page_end"] = max(tb_map[tb.name]["page_end"], page.number)
+            tb_map[tb.name]["start_page"] = min(tb_map[tb.name]["start_page"], page.number)
+            tb_map[tb.name]["end_page"] = max(tb_map[tb.name]["end_page"], page.number)
             tb_map[tb.name]["pages"].add(page.number)
 
     # Create merged TextBlock objects
@@ -119,8 +119,8 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
             name=tb_data["name"],
             title=tb_data["title"],
             elements=tb_data["elements"],
-            page_start=tb_data["page_start"],
-            page_end=tb_data["page_end"],
+            start_page=tb_data["start_page"],
+            end_page=tb_data["end_page"],
             source_pages=sorted(tb_data["pages"])
         )
         merged.append(tb)
@@ -132,8 +132,8 @@ def chunk_text_block(
     text_block: TextBlock,
     chunk_size: int = 512,
     chunk_overlap: int = 128,
-    include_title_as_header: bool = True
-) -> List[MarkdownChunk]:
+    header: Optional[str] = None
+) -> List[Chunk]:
     """
     Chunk a single TextBlock (financial note).
 
@@ -141,17 +141,17 @@ def chunk_text_block(
         text_block: TextBlock object (possibly spanning multiple pages)
         chunk_size: Target chunk size in tokens (estimated as chars/4)
         chunk_overlap: Overlap between chunks in tokens
-        include_title_as_header: Prepend note title to chunks for embedding
+        header: Optional header to prepend to each chunk's embedding_text
 
     Returns:
-        List of MarkdownChunk objects with elements preserved
+        List of Chunk objects with elements preserved
 
     Example:
         >>> merged = merge_text_blocks(pages)
        >>> debt_note = [tb for tb in merged if "Debt" in tb.title][0]
-        >>> chunks = chunk_text_block(debt_note, chunk_size=512)
+        >>> chunks = chunk_text_block(debt_note, chunk_size=512, header="Company: AAPL | Note: Debt")
        >>> print(f"Chunked {debt_note.title} into {len(chunks)} chunks")
-        >>> print(f"Note spans pages {debt_note.page_start}-{debt_note.page_end}")
+        >>> print(f"Note spans pages {debt_note.start_page}-{debt_note.end_page}")
     """
     # Group elements by page
     elements_by_page = defaultdict(list)
@@ -172,8 +172,6 @@ def chunk_text_block(
                 elements=elems  # Only elements from this page
             ))
 
-    # Chunk normally across all pages
-    header = f"Note: {text_block.title}" if include_title_as_header and text_block.title else None
-    chunker = MarkdownChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    chunker = Chunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
 
     return chunker.split(pages=pages, header=header)
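Callers of `chunk_text_block` now build the header themselves; the removed lines above show the old flag auto-generated `f"Note: {title}"`. A migration sketch (assuming `debt_note` is a merged TextBlock, as in the docstring example):

    # 0.1.5: title prepended automatically
    chunks = chunk_text_block(debt_note, chunk_size=512, include_title_as_header=True)

    # 0.1.12: pass the equivalent header explicitly
    header = f"Note: {debt_note.title}" if debt_note.title else None
    chunks = chunk_text_block(debt_note, chunk_size=512, header=header)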
sec2md/models.py CHANGED
@@ -248,8 +248,8 @@ class TextBlock(BaseModel):
     elements: List['Element'] = Field(default_factory=list, description="Element objects in this TextBlock")
 
     # Optional: Set by merge_text_blocks() for multi-page notes
-    page_start: Optional[int] = Field(None, description="First page this TextBlock appears on")
-    page_end: Optional[int] = Field(None, description="Last page this TextBlock appears on")
+    start_page: Optional[int] = Field(None, description="First page this TextBlock appears on")
+    end_page: Optional[int] = Field(None, description="Last page this TextBlock appears on")
     source_pages: Optional[List[int]] = Field(None, description="All pages this TextBlock spans")
 
     model_config = {"frozen": False, "arbitrary_types_allowed": True}
@@ -261,7 +261,7 @@ class TextBlock(BaseModel):
         return [e.id for e in self.elements]
 
     def __repr__(self) -> str:
-        pages_info = f", pages={self.page_start}-{self.page_end}" if self.page_start else ""
+        pages_info = f", pages={self.start_page}-{self.end_page}" if self.start_page else ""
         return f"TextBlock(name='{self.name}', title='{self.title}', elements={len(self.elements)}{pages_info})"
 
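The field rename ships without a backward-compatibility alias, so code reading the old attributes raises `AttributeError` on 0.1.12. A sketch, assuming `tb` is a TextBlock returned by merge_text_blocks():

    span = (tb.page_start, tb.page_end)    # 0.1.5
    span = (tb.start_page, tb.end_page)    # 0.1.12
    print(tb.source_pages)                 # unchanged, e.g. [45, 46]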
sec2md/parser.py CHANGED
@@ -710,7 +710,7 @@ class Parser:
 
         return current
 
-    def get_pages(self, include_elements: bool = False) -> List[Page]:
+    def get_pages(self, include_elements: bool = True) -> List[Page]:
         """Get parsed pages as Page objects.
 
         Args:
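Note the default flip: `get_pages()` now returns element metadata unless the caller opts out. A sketch, assuming `parser` is an already-constructed Parser:

    pages = parser.get_pages()                         # 0.1.12: elements included by default
    pages = parser.get_pages(include_elements=False)   # restores the 0.1.5 default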
sec2md/section_extractor.py CHANGED
@@ -143,8 +143,9 @@ class SectionExtractor:
     # ========== 8-K Specific Methods ==========
 
     # 8-K item header regex: ITEM 1.01 / 7.01 / 9.01
+    # Simplified pattern: match ONLY at line start, with strict formatting
     _ITEM_8K_RE = re.compile(
-        rf'^\s*{LEAD_WRAP}(ITEM)\s+([1-9]\.\d{{2}}[A-Z]?)\.?\s*(?:[:.\-–—]\s*)?(.*)',
+        rf'^\s*{LEAD_WRAP}(ITEM)\s+([1-9]\.\d{{2}}[A-Z]?)\.?\s*(?:[:.\-–—]\s*)?(.*)$',
         re.IGNORECASE | re.MULTILINE
     )
 
@@ -250,109 +251,206 @@ class SectionExtractor:
         end = mstop.start() if mstop else next_item_start
         return doc[start_after:end].strip()
 
+    def _is_8k_boilerplate_page(self, page_content: str, page_num: int) -> bool:
+        """Detect cover, TOC, and signature pages in 8-Ks."""
+        # Cover page is always page 1
+        if page_num == 1:
+            return True
+
+        # TOC page: has "TABLE OF CONTENTS" header (with or without bold markdown)
+        # Also detect if page has multiple ITEM entries with page numbers (TOC table pattern)
+        if re.search(r'TABLE OF CONTENTS', page_content, re.IGNORECASE):
+            return True
+
+        # Alternative TOC detection: page has multiple items with "| digit |" pattern (page numbers in table)
+        item_with_page_count = len(re.findall(r'ITEM\s+[1-9]\.\d{2}.*?\|\s*\d+\s*\|', page_content, re.IGNORECASE))
+        if item_with_page_count >= 2:  # If 2+ items have page numbers, it's a TOC
+            return True
+
+        # Signatures page: has "SIGNATURES" header and filing signature text
+        if re.search(r'\*\*SIGNATURES\*\*', page_content) and \
+           re.search(r'Pursuant to the requirements', page_content, re.IGNORECASE):
+            return True
+
+        return False
+
     def _get_8k_sections(self) -> List[Any]:
-        """Extract 8-K sections (items only, no PART divisions)."""
+        """Extract 8-K sections using page-by-page approach like standard extractor."""
         from sec2md.models import Section, Page, ITEM_8K_TITLES
 
-        # Concatenate all pages into one doc
-        full_content = "\n\n".join(p["content"] for p in self.pages)
-        doc = self._clean_8k_text(full_content)
+        sections = []
+        current_item = None
+        current_item_title = None
+        current_pages: List[Dict] = []
 
-        if not doc:
-            self._log("DEBUG: No content after cleaning")
-            return []
+        def flush_section():
+            nonlocal sections, current_item, current_item_title, current_pages
+            if current_pages and current_item:
+                # Parse exhibits if this is ITEM 9.01
+                exhibits = None
+                if current_item.startswith("ITEM 9.01"):
+                    content = "\n".join(p["content"] for p in current_pages)
+                    md = re.search(r'^\s*\(?d\)?\s*Exhibits\b.*$', content, re.IGNORECASE | re.MULTILINE)
+                    ex_block = content[md.end():].strip() if md else content
+                    parsed_exhibits = self._parse_exhibits(ex_block)
+                    exhibits = parsed_exhibits if parsed_exhibits else None
+
+                # Convert page dicts to Page objects
+                page_objects = [Page(number=p["page"], content=p["content"], elements=None, text_blocks=None)
+                                for p in current_pages]
+
+                sections.append(Section(
+                    part=None,
+                    item=current_item,
+                    item_title=current_item_title,
+                    pages=page_objects,
+                    exhibits=exhibits
+                ))
+            current_pages = []
 
-        # Find all item headers
-        headers: List[Dict] = []
-        for m in self._ITEM_8K_RE.finditer(doc):
-            code = self._normalize_8k_item_code(m.group(2))
-            title_inline = (m.group(3) or "").strip()
-            # Clean markdown artifacts from title
-            title_inline = MD_EDGE.sub("", title_inline)
-            title = title_inline if title_inline else ITEM_8K_TITLES.get(code)
-            headers.append({"start": m.start(), "end": m.end(), "no": code, "title": title})
-            self._log(f"DEBUG: Found ITEM {code} at position {m.start()}")
+        for page_dict in self.pages:
+            page_num = page_dict["page"]
+            content = page_dict["content"]
 
-        if not headers:
-            self._log("DEBUG: No item headers found")
-            return []
+            # Skip boilerplate pages
+            if self._is_8k_boilerplate_page(content, page_num):
+                self._log(f"DEBUG: Page {page_num} is boilerplate, skipping")
+                continue
 
-        self._log(f"DEBUG: Total headers found: {len(headers)}")
+            # Find first valid ITEM header on this page (if any)
+            item_m = None
+            first_idx = None
 
-        # Extract sections
-        results: List[Section] = []
-        for i, h in enumerate(headers):
-            code = h["no"]
-            next_start = headers[i + 1]["start"] if i + 1 < len(headers) else len(doc)
-            body = self._slice_8k_body(doc, h["end"], next_start)
+            for m in self._ITEM_8K_RE.finditer(content):
+                # Get the full line for this match
+                line_start = content.rfind('\n', 0, m.start()) + 1
+                line_end = content.find('\n', m.end())
+                if line_end == -1:
+                    line_end = len(content)
+                full_line = content[line_start:line_end].strip()
+
+                # Skip if this is a table row (contains pipe characters)
+                if '|' in full_line:
+                    self._log(f"DEBUG: Page {page_num} skipping table row: {full_line[:60]}")
+                    continue
+
+                # Get item code and title
+                code = self._normalize_8k_item_code(m.group(2))
+                title_inline = (m.group(3) or "").strip()
+                title_inline = MD_EDGE.sub("", title_inline)
+
+                # This is a valid ITEM header
+                item_m = m
+                first_idx = m.start()
+                self._log(f"DEBUG: Page {page_num} found ITEM {code} at position {first_idx}")
+                break
+
+            # No item header found - add to current section
+            if first_idx is None:
+                if current_item:
+                    current_pages.append({"page": page_num, "content": content.strip()})
+                continue
+
+            # Found item header - split page
+            before = content[:first_idx].strip()
+            after = content[first_idx:].strip()
+
+            # Add "before" content to current section
+            if current_item and before:
+                current_pages.append({"page": page_num, "content": before})
+
+            # Flush current section
+            flush_section()
+
+            # Start new section
+            code = self._normalize_8k_item_code(item_m.group(2))
+            title_inline = (item_m.group(3) or "").strip()
+            title_inline = MD_EDGE.sub("", title_inline)
+            current_item = f"ITEM {code}"
+            current_item_title = title_inline if title_inline else ITEM_8K_TITLES.get(code)
 
             # Filter by desired_items if provided
             if self.desired_items and code not in self.desired_items:
                 self._log(f"DEBUG: Skipping ITEM {code} (not in desired_items)")
+                current_item = None
+                current_item_title = None
                 continue
 
-            # For 9.01, parse exhibits
-            exhibits = []
-            if code.startswith("9.01"):
-                md = re.search(r'^\s*\(?d\)?\s*Exhibits\b.*$', body, re.IGNORECASE | re.MULTILINE)
-                ex_block = body[md.end():].strip() if md else body
-                exhibits = self._parse_exhibits(ex_block)
-                self._log(f"DEBUG: Found {len(exhibits)} exhibits in 9.01")
-
-            # Map back to Page objects (approximate page boundaries from original content)
-            # Since 8-K sections can span pages, we need to find which pages contain this content
-            section_pages = self._map_8k_content_to_pages(body)
-
-            # Create Section with exhibits (now part of the model)
-            section = Section(
-                part=None,  # 8-K has no PART divisions
-                item=f"ITEM {code}",
-                item_title=h["title"],
-                pages=section_pages,
-                exhibits=exhibits if exhibits else None
-            )
+            # Add "after" content to new section
+            if after:
+                current_pages.append({"page": page_num, "content": after})
 
-            results.append(section)
-            self._log(f"DEBUG: Extracted ITEM {code} with {len(section_pages)} pages")
+        # Flush final section
+        flush_section()
 
-        self._log(f"DEBUG: Total sections extracted: {len(results)}")
-        return results
+        self._log(f"DEBUG: Total sections extracted: {len(sections)}")
+        return sections
 
     def _map_8k_content_to_pages(self, section_content: str) -> List[Any]:
-        """Map extracted section content back to Page objects."""
+        """Map extracted section content back to Page objects, splitting at section boundaries."""
         from sec2md.models import Page
 
-        # Try to find which original pages contain this content
-        # This is heuristic-based: match by content overlap
         matched_pages = []
-        section_preview = section_content[:500]  # Use first 500 chars for matching
+        section_content_cleaned = self._clean_8k_text(section_content)
+        remaining_section = section_content_cleaned
 
-        for page_dict in self.pages:
+        # Use filtered pages (excludes cover, TOC, signatures)
+        pages_to_search = getattr(self, '_filtered_8k_pages', self.pages)
+
+        for page_dict in pages_to_search:
             page_num = page_dict["page"]
-            page_content = self._clean_8k_text(page_dict["content"])
+            page_content = page_dict["content"]
+            page_content_cleaned = self._clean_8k_text(page_content)
 
-            # Check if this page contains part of the section
-            if section_preview in page_content or page_content in section_content:
-                original_page = self._original_pages.get(page_num)
-                matched_pages.append(
-                    Page(
-                        number=page_num,
-                        content=page_content,
-                        elements=original_page.elements if original_page else None,
-                        text_blocks=original_page.text_blocks if original_page else None
-                    )
-                )
+            # Skip pages that don't contain any of the remaining section content
+            if not any(chunk in page_content_cleaned for chunk in remaining_section[:200].split()[:10]):
+                continue
+
+            # Find where the section content appears on this page
+            # Use the original page to preserve formatting/elements
+            original_page = self._original_pages[page_num]
+
+            # For 8-K, we need to split the page content at ITEM boundaries
+            # Find all ITEM headers on this page
+            item_positions = []
+            for m in self._ITEM_8K_RE.finditer(page_content_cleaned):
+                code = self._normalize_8k_item_code(m.group(2))
+                title = (m.group(3) or "").strip()
+                # Skip TOC entries
+                if not re.search(r'\|\s*\d+\s*\|', title):
+                    item_positions.append((m.start(), f"ITEM {code}"))
+
+            # Find which portion of the page belongs to this section
+            section_start_in_page = page_content_cleaned.find(section_content_cleaned[:100])
+
+            if section_start_in_page >= 0:
+                # Find the end: either next ITEM on this page, or end of page
+                section_end_in_page = len(page_content_cleaned)
+                for pos, item_code in item_positions:
+                    # Find the next ITEM after our section starts
+                    if pos > section_start_in_page + 50:  # Give 50 chars buffer
+                        section_end_in_page = pos
+                        break
+
+                # Extract just this section's content from the page
+                page_section_content = page_content_cleaned[section_start_in_page:section_end_in_page].strip()
 
-        # If no matches found (shouldn't happen), create a synthetic page
-        if not matched_pages:
-            matched_pages.append(
-                Page(
-                    number=1,
-                    content=section_content,
-                    elements=None,
+                # Create a new Page with only this section's content
+                # Note: This loses elements, but keeps the section boundary clean
+                matched_pages.append(Page(
+                    number=page_num,
+                    content=page_section_content,
+                    elements=None,  # TODO: Could filter elements by content matching
                     text_blocks=None
-                )
-            )
+                ))
+
+                # Update remaining section content to find on next pages
+                # Remove what we've matched from the section
+                matched_len = len(page_section_content)
+                remaining_section = remaining_section[matched_len:] if matched_len < len(remaining_section) else ""
+
+                if not remaining_section.strip():
+                    break  # Found all content for this section
 
         return matched_pages
 
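Taken together, the new 8-K path walks pages in order, drops cover/TOC/signature boilerplate, and splits pages at item headers instead of regex-slicing one concatenated document. A hedged end-to-end sketch using the public helpers (the `filing_type="8-K"` value is an assumption; the docstrings above only show "10-K"):

    import sec2md

    pages = sec2md.convert_to_markdown(html, return_pages=True)
    sections = sec2md.extract_sections(pages, filing_type="8-K")
    for section in sections:
        print(section.item, section.item_title)  # e.g. "ITEM 9.01", "Financial Statements and Exhibits"
        if section.exhibits:                     # populated only for ITEM 9.01 in the code above
            print(section.exhibits)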
sec2md-0.1.5.dist-info/METADATA → sec2md-0.1.12.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sec2md
-Version: 0.1.5
+Version: 0.1.12
 Summary: Convert SEC EDGAR filings to LLM-ready Markdown for AI agents and agentic RAG
 Author-email: Lucas Astorian <lucas@intellifin.ai>
 License: MIT
sec2md-0.1.12.dist-info/RECORD ADDED
@@ -0,0 +1,19 @@
+sec2md/__init__.py,sha256=cKVj4J_IPlcrZASlumEpjv69dMjIveatYUtPjASm1nE,988
+sec2md/absolute_table_parser.py,sha256=rphc5_HttniV2RtPCThQ68HWyyZIn9l-gkaFsbtQXBU,22982
+sec2md/chunking.py,sha256=OUjVffiqrHkFakFGjCZffE88G_jhIu9RBpVEbliF9jU,6115
+sec2md/core.py,sha256=hmdJXitoEWuekR5f3B1oEK1xmPux0t494lOpg5aJrRk,2663
+sec2md/models.py,sha256=zZNRp4S7pI_KHRSQwA04uSNYpDej-OzYW3S-mX2Irmc,14735
+sec2md/parser.py,sha256=-uyorKhrXrn_3dKMqq4peo2bdxcGvkQVHI5riSXX7z4,47558
+sec2md/section_extractor.py,sha256=0MqS_xluIQcI10u8-q7pk3v0uG8p8htlb4Sv0agh3Xg,30663
+sec2md/sections.py,sha256=wtmKqF_KP_G-7_qAxGvxs25U_4vcH5NDGn14ouEy5GE,2784
+sec2md/table_parser.py,sha256=FhR8OwX5NAJmzdbTFzHQTGUNUPieYN37UzMFbQMkogU,12551
+sec2md/utils.py,sha256=2lUeN5irTbdIyjylCkaPKMv4ALWxWMJl96PTO8FV3Ik,2990
+sec2md/chunker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sec2md/chunker/blocks.py,sha256=LiPV0GX0LYGkV-3kfxeBA9OCmMVjOjrwL46PH8snXw4,3388
+sec2md/chunker/chunk.py,sha256=eF7QAOita6AW_sp2Sg69853ZOH7npwM5o-AEem62RRk,4729
+sec2md/chunker/chunker.py,sha256=_VhrxfSCarnPGIx6LHIurgCEiwH3Tz7kVZuECgTNw2w,10588
+sec2md-0.1.12.dist-info/licenses/LICENSE,sha256=uJDiSGQ5TOx-PGhu2LGH4A-O53vS5hrQ5sc3j2Ps_Rk,1071
+sec2md-0.1.12.dist-info/METADATA,sha256=eSwrrLVm2fNKlpEIBKY-wm4VwKwwh7i-egy3FIfURqA,7626
+sec2md-0.1.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+sec2md-0.1.12.dist-info/top_level.txt,sha256=Jpmw3laEWwS9fljtAEg4sExjFw3zP8dGarjIknyh1v8,7
+sec2md-0.1.12.dist-info/RECORD,,
sec2md-0.1.5.dist-info/RECORD DELETED
@@ -1,19 +0,0 @@
-sec2md/__init__.py,sha256=iR_2g-PDkCAzY76uQwBjIVpprvkxlNopdmDduzDp8lg,1037
-sec2md/absolute_table_parser.py,sha256=rphc5_HttniV2RtPCThQ68HWyyZIn9l-gkaFsbtQXBU,22982
-sec2md/chunking.py,sha256=SQASDA057bKLhSj34GNAHrRl94Rf-A9WlfEvhhWPuIc,6350
-sec2md/core.py,sha256=hmdJXitoEWuekR5f3B1oEK1xmPux0t494lOpg5aJrRk,2663
-sec2md/models.py,sha256=H_3HnI8exGVnbqbdT1Bf4bNhPLjqvlP64ud0au5ohJk,14735
-sec2md/parser.py,sha256=J1He6XMa1Mf9YGJCEffWuCs7SAqi0Ts6S445CTO-lAA,47559
-sec2md/section_extractor.py,sha256=JTbZpPgmTipzU1Q5LehlQ9y2X4ZcQRTj3A7iMr90iqM,25976
-sec2md/sections.py,sha256=wtmKqF_KP_G-7_qAxGvxs25U_4vcH5NDGn14ouEy5GE,2784
-sec2md/table_parser.py,sha256=FhR8OwX5NAJmzdbTFzHQTGUNUPieYN37UzMFbQMkogU,12551
-sec2md/utils.py,sha256=2lUeN5irTbdIyjylCkaPKMv4ALWxWMJl96PTO8FV3Ik,2990
-sec2md/chunker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sec2md/chunker/markdown_blocks.py,sha256=yEF_v72DvYOVu0ZQ5bBCFpNM12INg-8RmajIu_dorQQ,4372
-sec2md/chunker/markdown_chunk.py,sha256=hCMpjn0cc5TIjWSZviq4fM7e781X3AtRcmI60pDLWro,4763
-sec2md/chunker/markdown_chunker.py,sha256=IYW8pQ2q9hX1lRGw4TnKAQcr-HmJfSW7wffu-BA0Jms,10743
-sec2md-0.1.5.dist-info/licenses/LICENSE,sha256=uJDiSGQ5TOx-PGhu2LGH4A-O53vS5hrQ5sc3j2Ps_Rk,1071
-sec2md-0.1.5.dist-info/METADATA,sha256=YWQ9uiut1LcBQxOCvFcT8MlfgLO7VBCDtEju5h7fp6k,7625
-sec2md-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-sec2md-0.1.5.dist-info/top_level.txt,sha256=Jpmw3laEWwS9fljtAEg4sExjFw3zP8dGarjIknyh1v8,7
-sec2md-0.1.5.dist-info/RECORD,,