sec2md 0.1.5__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of sec2md might be problematic.
- sec2md/__init__.py +5 -5
- sec2md/chunker/{markdown_blocks.py → blocks.py} +1 -27
- sec2md/chunker/{markdown_chunk.py → chunk.py} +4 -4
- sec2md/chunker/{markdown_chunker.py → chunker.py} +18 -18
- sec2md/chunking.py +26 -28
- sec2md/models.py +3 -3
- sec2md/parser.py +1 -1
- sec2md/section_extractor.py +63 -25
- {sec2md-0.1.5.dist-info → sec2md-0.1.10.dist-info}/METADATA +1 -1
- sec2md-0.1.10.dist-info/RECORD +19 -0
- sec2md-0.1.5.dist-info/RECORD +0 -19
- {sec2md-0.1.5.dist-info → sec2md-0.1.10.dist-info}/WHEEL +0 -0
- {sec2md-0.1.5.dist-info → sec2md-0.1.10.dist-info}/licenses/LICENSE +0 -0
- {sec2md-0.1.5.dist-info → sec2md-0.1.10.dist-info}/top_level.txt +0 -0
sec2md/__init__.py
CHANGED
@@ -5,12 +5,12 @@ from sec2md.utils import flatten_note
 from sec2md.sections import extract_sections, get_section
 from sec2md.chunking import chunk_pages, chunk_section, merge_text_blocks, chunk_text_block
 from sec2md.models import Page, Section, Item10K, Item10Q, Item8K, FilingType, Element, TextBlock, Exhibit
-from sec2md.chunker.markdown_chunk import MarkdownChunk
-from sec2md.chunker.markdown_chunker import MarkdownChunker
+from sec2md.chunker.chunk import Chunk
+from sec2md.chunker.chunker import Chunker
 from sec2md.parser import Parser
 from sec2md.section_extractor import SectionExtractor
 
-__version__ = "0.1.5"
+__version__ = "0.1.10"
 __all__ = [
     "convert_to_markdown",
     "flatten_note",

@@ -29,8 +29,8 @@ __all__ = [
     "Item10Q",
     "Item8K",
     "FilingType",
-    "MarkdownChunk",
-    "MarkdownChunker",
+    "Chunk",
+    "Chunker",
     "Parser",
     "SectionExtractor",
 ]
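For consumers of the public API, the 0.1.5 → 0.1.10 change is a pure rename; a minimal migration sketch (illustrative only, assuming sec2md is installed and `pages` comes from the documented conversion path):

# 0.1.5:
# from sec2md import MarkdownChunk, MarkdownChunker
# 0.1.10: the same objects under shorter names
from sec2md import Chunk, Chunker

chunker = Chunker(chunk_size=512, chunk_overlap=128)
# pages would come from e.g. sec2md.convert_to_markdown(html, return_pages=True)
# chunks = chunker.split(pages=pages, header="Company: AAPL | Filing: 10-K")
# every element of chunks is now a Chunk instance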
sec2md/chunker/markdown_blocks.py → sec2md/chunker/blocks.py
RENAMED

@@ -73,40 +73,14 @@ class TextBlock(BaseBlock):
         return cls(content=content, page=page, block_type='Text')
 
 
-class …
-    block_type: str = Field(default="Text", description="Audio paragraph block type")
-    paragraph_id: int = Field(..., description="Paragraph ID")
-    audio_start: float = Field(..., description="Audio start time")
-    audio_end: float = Field(..., description="Audio end time")
-
-    @computed_field
-    @property
-    def sentences(self) -> List[Sentence]:
-        """Returns the text block sentences"""
-        return [Sentence(content=content) for content in split_sentences(self.content)]
-
-    def format(self) -> dict:
-        """Formats the audio paragraphs"""
-        return {"id": self.paragraph_id, "content": self.content, "start": self.audio_start, "end": self.audio_end}
-
-
-class TableBlock(BaseModel):
+class TableBlock(BaseBlock):
     block_type: str = Field(default='Table', description="Table block type")
-    content: str = Field(..., description="Table content")
-    page: int = Field(..., description="Page number")
-
-    model_config = {"frozen": False}
 
     def __init__(self, **data):
         if 'content' in data:
             data['content'] = self._to_minified_markdown_static(data['content'])
         super().__init__(**data)
 
-    @computed_field
-    @property
-    def tokens(self) -> int:
-        return estimate_tokens(self.content)
-
     @staticmethod
     def _to_minified_markdown_static(content: str) -> str:
         """Returns the table in a Minified Markdown format"""
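TableBlock drops its local content, page, and tokens definitions when it switches its base class from BaseModel to BaseBlock, so those presumably now come from BaseBlock. A hedged construction sketch against the new shape (the table values are made up):

from sec2md.chunker.blocks import TableBlock

# __init__ minifies the markdown table before storing it
table = TableBlock(
    content="| Metric | FY24 |\n| --- | --- |\n| Revenue | $100 |",  # made-up table
    page=7,
)
assert table.block_type == "Table"
# page, content, and the tokens estimate are assumed inherited from BaseBlock per this diff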
sec2md/chunker/markdown_chunk.py → sec2md/chunker/chunk.py
RENAMED

@@ -1,7 +1,7 @@
 from typing import List, Optional, Tuple, TYPE_CHECKING
 from pydantic import BaseModel, Field, computed_field
 
-from sec2md.chunker.markdown_blocks import BaseBlock
+from sec2md.chunker.blocks import BaseBlock
 
 if TYPE_CHECKING:
     from sec2md.models import Element

@@ -9,8 +9,8 @@ else:
     Element = 'Element'  # Forward reference for Pydantic
 
 
-class MarkdownChunk(BaseModel):
-    """Represents a chunk of …"""
+class Chunk(BaseModel):
+    """Represents a chunk of content that can be embedded"""
 
     blocks: List[BaseBlock] = Field(..., description="List of markdown blocks in this chunk")
     header: Optional[str] = Field(None, description="Optional header for embedding context")

@@ -126,7 +126,7 @@ class MarkdownChunk(BaseModel):
 
     def __repr__(self):
         pages_str = f"{self.start_page}-{self.end_page}" if self.start_page != self.end_page else str(self.start_page)
-        return f"MarkdownChunk(pages={pages_str}, blocks={len(self.blocks)}, tokens={self.num_tokens})"
+        return f"Chunk(pages={pages_str}, blocks={len(self.blocks)}, tokens={self.num_tokens})"
 
     def _repr_markdown_(self):
         """This method is called by IPython to display as Markdown"""
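Chunk keeps Element as a string annotation at runtime and relies on a later model_rebuild() call (done in chunker.py below). A minimal reproduction of that Pydantic forward-reference pattern, using a stand-in model rather than sec2md's actual Chunk:

from typing import List, TYPE_CHECKING
from pydantic import BaseModel

if TYPE_CHECKING:
    from sec2md.models import Element
else:
    Element = 'Element'  # forward reference for Pydantic, as in chunk.py

class DemoChunk(BaseModel):  # stand-in for sec2md's Chunk
    elements: List[Element] = []

# Once the real class is importable, the annotation can be resolved,
# which is exactly what chunker.py does at import time:
from sec2md.models import Element
DemoChunk.model_rebuild()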
sec2md/chunker/markdown_chunker.py → sec2md/chunker/chunker.py
RENAMED

@@ -1,24 +1,24 @@
 import logging
 from typing import Union, Tuple, List, Dict, Any
 
-from sec2md.chunker.markdown_chunk import MarkdownChunk
-from sec2md.chunker.markdown_blocks import BaseBlock, TextBlock, TableBlock, HeaderBlock
+from sec2md.chunker.chunk import Chunk
+from sec2md.chunker.blocks import BaseBlock, TextBlock, TableBlock, HeaderBlock
 
-# Rebuild MarkdownChunk after Element is defined
+# Rebuild Chunk after Element is defined
 from sec2md.models import Element
-MarkdownChunk.model_rebuild()
+Chunk.model_rebuild()
 
 logger = logging.getLogger(__name__)
 
 
-class MarkdownChunker:
-    """Splits …"""
+class Chunker:
+    """Splits content into chunks"""
 
     def __init__(self, chunk_size: int = 512, chunk_overlap: int = 128):
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
 
-    def split(self, pages: List[Any], header: str = None) -> List[MarkdownChunk]:
+    def split(self, pages: List[Any], header: str = None) -> List[Chunk]:
         """Split the pages into chunks with optional header for embedding context.
 
         Args:

@@ -26,7 +26,7 @@ class MarkdownChunker:
             header: Optional header to prepend to each chunk's embedding_text
 
         Returns:
-            List of MarkdownChunk objects
+            List of Chunk objects
         """
         # Build element map: page -> List[Element objects]
         page_elements = {}

@@ -64,14 +64,14 @@ class MarkdownChunker:
            last_page = page
 
            for line in page.content.split('\n'):
-                if table_content and not MarkdownChunker._is_table_line(line):
+                if table_content and not Chunker._is_table_line(line):
                    blocks.append(TableBlock(content=table_content, page=page.number))
                    table_content = ""
 
                if line.startswith("#"):
                    blocks.append(HeaderBlock(content=line, page=page.number))
 
-                elif MarkdownChunker._is_table_line(line):
+                elif Chunker._is_table_line(line):
                    table_content += f"{line}\n"
 
                else:

@@ -96,7 +96,7 @@ class MarkdownChunker:
            return True
        return True
 
-    def _chunk_blocks(self, blocks: List[BaseBlock], header: str = None, page_elements: dict = None) -> List[MarkdownChunk]:
+    def _chunk_blocks(self, blocks: List[BaseBlock], header: str = None, page_elements: dict = None) -> List[Chunk]:
        """Converts the blocks to chunks"""
        page_elements = page_elements or {}
        chunks = []

@@ -127,7 +127,7 @@ class MarkdownChunker:
        return chunks
 
    def _process_text_block(self, block: TextBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
-                            chunks: List[MarkdownChunk], header: str = None, page_elements: dict = None):
+                            chunks: List[Chunk], header: str = None, page_elements: dict = None):
        """Process a text block by breaking it into sentences if needed"""
        sentences = []
        sentences_tokens = 0

@@ -156,7 +156,7 @@ class MarkdownChunker:
        return chunk_blocks, num_tokens, chunks
 
    def _process_table_block(self, block: BaseBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
-                             chunks: List[MarkdownChunk], all_blocks: List[BaseBlock], block_idx: int, header: str = None, page_elements: dict = None):
+                             chunks: List[Chunk], all_blocks: List[BaseBlock], block_idx: int, header: str = None, page_elements: dict = None):
        """Process a table block with optional header backtrack"""
        context = []
        context_tokens = 0

@@ -200,7 +200,7 @@ class MarkdownChunker:
        return chunk_blocks, num_tokens, chunks
 
    def _process_header_table_block(self, block: BaseBlock, chunk_blocks: List[BaseBlock], num_tokens: int,
-                                    chunks: List[MarkdownChunk], next_block: BaseBlock, header: str = None, page_elements: dict = None):
+                                    chunks: List[Chunk], next_block: BaseBlock, header: str = None, page_elements: dict = None):
        """Process a header block"""
        if not chunk_blocks:
            chunk_blocks.append(block)

@@ -223,17 +223,17 @@ class MarkdownChunker:
 
        return chunk_blocks, num_tokens, chunks
 
-    def _finalize_chunk(self, chunks: List[MarkdownChunk], blocks: List[BaseBlock], header: str, page_elements: dict):
+    def _finalize_chunk(self, chunks: List[Chunk], blocks: List[BaseBlock], header: str, page_elements: dict):
        """Create chunk with elements from the pages it spans"""
        chunk_pages = set(block.page for block in blocks)
        elements = []
        for page_num in sorted(chunk_pages):
            if page_num in page_elements:
                elements.extend(page_elements[page_num])
-        chunks.append(MarkdownChunk(blocks=blocks, header=header, elements=elements))
+        chunks.append(Chunk(blocks=blocks, header=header, elements=elements))
 
-    def _create_chunk(self, chunks: List[MarkdownChunk], blocks: List[BaseBlock], header: str = None, page_elements: dict = None) -> Tuple[
-        List[MarkdownChunk], List[BaseBlock], int]:
+    def _create_chunk(self, chunks: List[Chunk], blocks: List[BaseBlock], header: str = None, page_elements: dict = None) -> Tuple[
+        List[Chunk], List[BaseBlock], int]:
        """Creates a chunk and returns overlap blocks"""
        page_elements = page_elements or {}
        self._finalize_chunk(chunks, blocks, header, page_elements)
sec2md/chunking.py
CHANGED
@@ -1,10 +1,10 @@
-"""Chunking utilities for page-aware …"""
+"""Chunking utilities for page-aware splitting."""
 
 from typing import List, Optional
 from collections import defaultdict
 from sec2md.models import Page, Section, TextBlock
-from sec2md.chunker.markdown_chunker import MarkdownChunker
-from sec2md.chunker.markdown_chunk import MarkdownChunk
+from sec2md.chunker.chunker import Chunker
+from sec2md.chunker.chunk import Chunk
 
 
 def chunk_pages(

@@ -12,9 +12,9 @@ def chunk_pages(
     chunk_size: int = 512,
     chunk_overlap: int = 128,
     header: Optional[str] = None
-) -> List[MarkdownChunk]:
+) -> List[Chunk]:
     """
-    Chunk pages into overlapping …
+    Chunk pages into overlapping chunks.
 
     Args:
         pages: List of Page objects (with optional elements)

@@ -23,7 +23,7 @@ def chunk_pages(
        header: Optional header to prepend to each chunk's embedding_text
 
    Returns:
-        List of MarkdownChunk objects
+        List of Chunk objects with page tracking and elements
 
    Example:
        >>> pages = sec2md.convert_to_markdown(html, return_pages=True, include_elements=True)

@@ -32,7 +32,7 @@ def chunk_pages(
        ...     print(f"Page {chunk.page}: {chunk.content[:100]}...")
        ...     print(f"Elements: {chunk.elements}")
    """
-    chunker = MarkdownChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    chunker = Chunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return chunker.split(pages=pages, header=header)

@@ -41,9 +41,9 @@ def chunk_section(
     chunk_size: int = 512,
     chunk_overlap: int = 128,
     header: Optional[str] = None
-) -> List[MarkdownChunk]:
+) -> List[Chunk]:
     """
-    Chunk a filing section into overlapping …
+    Chunk a filing section into overlapping chunks.
 
     Args:
         section: Section object from extract_sections()

@@ -52,7 +52,7 @@ def chunk_section(
        header: Optional header to prepend to each chunk's embedding_text
 
    Returns:
-        List of MarkdownChunk objects
+        List of Chunk objects
 
    Example:
        >>> sections = sec2md.extract_sections(pages, filing_type="10-K")

@@ -79,8 +79,8 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
 
    Returns:
        List of merged TextBlock objects with page metadata:
-        - …
-        - …
+        - start_page: First page the note appears on
+        - end_page: Last page the note appears on
        - source_pages: All pages the note spans
        - elements: All elements from all pages

@@ -88,7 +88,7 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
        >>> pages = parser.get_pages(include_elements=True)
        >>> merged = merge_text_blocks(pages)
        >>> for tb in merged:
-        ...     print(f"{tb.title}: pages {tb.…
+        ...     print(f"{tb.title}: pages {tb.start_page}-{tb.end_page}")
        Debt Disclosure: pages 45-46
        Segment Reporting: pages 49-50
    """

@@ -97,8 +97,8 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
        "name": None,
        "title": None,
        "elements": [],
-        "…
-        "…
+        "start_page": float('inf'),
+        "end_page": -1,
        "pages": set()
    })

@@ -108,8 +108,8 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
            tb_map[tb.name]["name"] = tb.name
            tb_map[tb.name]["title"] = tb.title
            tb_map[tb.name]["elements"].extend(tb.elements)
-            tb_map[tb.name]["…
-            tb_map[tb.name]["…
+            tb_map[tb.name]["start_page"] = min(tb_map[tb.name]["start_page"], page.number)
+            tb_map[tb.name]["end_page"] = max(tb_map[tb.name]["end_page"], page.number)
            tb_map[tb.name]["pages"].add(page.number)

@@ -119,8 +119,8 @@ def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
            name=tb_data["name"],
            title=tb_data["title"],
            elements=tb_data["elements"],
-            …
-            …
+            start_page=tb_data["start_page"],
+            end_page=tb_data["end_page"],
            source_pages=sorted(tb_data["pages"])
        )
        merged.append(tb)

@@ -132,8 +132,8 @@ def chunk_text_block(
     text_block: TextBlock,
     chunk_size: int = 512,
     chunk_overlap: int = 128,
-    include_title_as_header: bool = …,
-) -> List[MarkdownChunk]:
+    header: Optional[str] = None
+) -> List[Chunk]:
     """
     Chunk a single TextBlock (financial note).
 

@@ -141,17 +141,17 @@ def chunk_text_block(
        text_block: TextBlock object (possibly spanning multiple pages)
        chunk_size: Target chunk size in tokens (estimated as chars/4)
        chunk_overlap: Overlap between chunks in tokens
-        include_title_as_header: …
+        header: Optional header to prepend to each chunk's embedding_text
 
    Returns:
-        List of MarkdownChunk objects
+        List of Chunk objects with elements preserved
 
    Example:
        >>> merged = merge_text_blocks(pages)
        >>> debt_note = [tb for tb in merged if "Debt" in tb.title][0]
-        >>> chunks = chunk_text_block(debt_note, chunk_size=512)
+        >>> chunks = chunk_text_block(debt_note, chunk_size=512, header="Company: AAPL | Note: Debt")
        >>> print(f"Chunked {debt_note.title} into {len(chunks)} chunks")
-        >>> print(f"Note spans pages {debt_note.…
+        >>> print(f"Note spans pages {debt_note.start_page}-{debt_note.end_page}")
    """
    # Group elements by page
    elements_by_page = defaultdict(list)

@@ -172,8 +172,6 @@ def chunk_text_block(
            elements=elems  # Only elements from this page
        ))
 
-
-    header = f"Note: {text_block.title}" if include_title_as_header and text_block.title else None
-    chunker = MarkdownChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    chunker = Chunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
 
    return chunker.split(pages=pages, header=header)
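The last hunk is the one behavioral change in this file: chunk_text_block() no longer synthesizes a header from the note title (the include_title_as_header flag is gone), so callers pass header explicitly. A migration sketch reproducing the old title-derived header (assumes `pages` was built as in the docstring examples above):

from sec2md.chunking import merge_text_blocks, chunk_text_block

# pages = parser.get_pages(include_elements=True)  # as in the docstring example
merged = merge_text_blocks(pages)
debt_note = [tb for tb in merged if "Debt" in tb.title][0]

# 0.1.5 built this header internally; 0.1.10 expects the caller to provide it:
chunks = chunk_text_block(
    debt_note,
    chunk_size=512,
    header=f"Note: {debt_note.title}" if debt_note.title else None,
)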
sec2md/models.py
CHANGED
@@ -248,8 +248,8 @@ class TextBlock(BaseModel):
     elements: List['Element'] = Field(default_factory=list, description="Element objects in this TextBlock")
 
     # Optional: Set by merge_text_blocks() for multi-page notes
-    …
-    …
+    start_page: Optional[int] = Field(None, description="First page this TextBlock appears on")
+    end_page: Optional[int] = Field(None, description="Last page this TextBlock appears on")
     source_pages: Optional[List[int]] = Field(None, description="All pages this TextBlock spans")
 
     model_config = {"frozen": False, "arbitrary_types_allowed": True}

@@ -261,7 +261,7 @@ class TextBlock(BaseModel):
        return [e.id for e in self.elements]
 
    def __repr__(self) -> str:
-        pages_info = f", pages={self.…
+        pages_info = f", pages={self.start_page}-{self.end_page}" if self.start_page else ""
        return f"TextBlock(name='{self.name}', title='{self.title}', elements={len(self.elements)}{pages_info})"
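start_page and end_page stay Optional and are only populated by merge_text_blocks(), which the guarded __repr__ above reflects. A small defensive-access sketch in the same spirit (tb is any sec2md TextBlock):

def describe(tb) -> str:
    """Mirror the __repr__ guard: paging info exists only after merge_text_blocks()."""
    if tb.start_page is not None:
        return f"{tb.title}: pages {tb.start_page}-{tb.end_page}"
    return f"{tb.title}: page span not set"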
sec2md/parser.py
CHANGED
sec2md/section_extractor.py
CHANGED
@@ -269,6 +269,12 @@ class SectionExtractor:
            title_inline = (m.group(3) or "").strip()
            # Clean markdown artifacts from title
            title_inline = MD_EDGE.sub("", title_inline)
+
+            # Skip TOC entries (they have page numbers like "| 3 |" in the title)
+            if re.search(r'\|\s*\d+\s*\|', title_inline):
+                self._log(f"DEBUG: Skipping TOC entry for ITEM {code}")
+                continue
+
            title = title_inline if title_inline else ITEM_8K_TITLES.get(code)
            headers.append({"start": m.start(), "end": m.end(), "no": code, "title": title})
            self._log(f"DEBUG: Found ITEM {code} at position {m.start()}")

@@ -303,6 +309,11 @@
            # Since 8-K sections can span pages, we need to find which pages contain this content
            section_pages = self._map_8k_content_to_pages(body)
 
+            # Skip sections with no matching pages
+            if not section_pages:
+                self._log(f"DEBUG: Skipping ITEM {code} (no pages found)")
+                continue
+
            # Create Section with exhibits (now part of the model)
            section = Section(
                part=None,  # 8-K has no PART divisions

@@ -319,40 +330,67 @@
        return results
 
    def _map_8k_content_to_pages(self, section_content: str) -> List[Any]:
-        """Map extracted section content back to Page objects."""
+        """Map extracted section content back to Page objects, splitting at section boundaries."""
        from sec2md.models import Page
 
-        # Try to find which original pages contain this content
-        # This is heuristic-based: match by content overlap
        matched_pages = []
+        section_content_cleaned = self._clean_8k_text(section_content)
+        remaining_section = section_content_cleaned
 
        for page_dict in self.pages:
            page_num = page_dict["page"]
-            page_content = …
+            page_content = page_dict["content"]
+            page_content_cleaned = self._clean_8k_text(page_content)
 
-            # …
-            if …
-                …
+            # Skip pages that don't contain any of the remaining section content
+            if not any(chunk in page_content_cleaned for chunk in remaining_section[:200].split()[:10]):
+                continue
+
+            # Find where the section content appears on this page
+            # Use the original page to preserve formatting/elements
+            original_page = self._original_pages[page_num]
+
+            # For 8-K, we need to split the page content at ITEM boundaries
+            # Find all ITEM headers on this page
+            item_positions = []
+            for m in self._ITEM_8K_RE.finditer(page_content_cleaned):
+                code = self._normalize_8k_item_code(m.group(2))
+                title = (m.group(3) or "").strip()
+                # Skip TOC entries
+                if not re.search(r'\|\s*\d+\s*\|', title):
+                    item_positions.append((m.start(), f"ITEM {code}"))
+
+            # Find which portion of the page belongs to this section
+            section_start_in_page = page_content_cleaned.find(section_content_cleaned[:100])
+
+            if section_start_in_page >= 0:
+                # Find the end: either next ITEM on this page, or end of page
+                section_end_in_page = len(page_content_cleaned)
+                for pos, item_code in item_positions:
+                    # Find the next ITEM after our section starts
+                    if pos > section_start_in_page + 50:  # Give 50 chars buffer
+                        section_end_in_page = pos
+                        break
+
+                # Extract just this section's content from the page
+                page_section_content = page_content_cleaned[section_start_in_page:section_end_in_page].strip()
 
-                …
-                elements=None,
+                # Create a new Page with only this section's content
+                # Note: This loses elements, but keeps the section boundary clean
+                matched_pages.append(Page(
+                    number=page_num,
+                    content=page_section_content,
+                    elements=None,  # TODO: Could filter elements by content matching
                    text_blocks=None
-            )
-
+                ))
+
+                # Update remaining section content to find on next pages
+                # Remove what we've matched from the section
+                matched_len = len(page_section_content)
+                remaining_section = remaining_section[matched_len:] if matched_len < len(remaining_section) else ""
+
+                if not remaining_section.strip():
+                    break  # Found all content for this section
 
        return matched_pages
 
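Both new guards key on the same table-cell pattern to tell a real ITEM header from a table-of-contents row, since TOC rows keep a trailing page-number cell like "| 3 |" after markdown conversion. The regex in isolation:

import re

TOC_CELL = re.compile(r'\|\s*\d+\s*\|')  # pattern used by both guards in this diff

print(bool(TOC_CELL.search("Item 2.02 Results of Operations | 3 |")))  # True  -> treated as TOC, skipped
print(bool(TOC_CELL.search("Item 2.02 Results of Operations")))        # False -> real section header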
sec2md-0.1.10.dist-info/RECORD
ADDED

@@ -0,0 +1,19 @@
+sec2md/__init__.py,sha256=WHduz6dNVQ_pLZ-OMs-9ikWD8Brc0HdHh1sfo_ygQYU,988
+sec2md/absolute_table_parser.py,sha256=rphc5_HttniV2RtPCThQ68HWyyZIn9l-gkaFsbtQXBU,22982
+sec2md/chunking.py,sha256=OUjVffiqrHkFakFGjCZffE88G_jhIu9RBpVEbliF9jU,6115
+sec2md/core.py,sha256=hmdJXitoEWuekR5f3B1oEK1xmPux0t494lOpg5aJrRk,2663
+sec2md/models.py,sha256=zZNRp4S7pI_KHRSQwA04uSNYpDej-OzYW3S-mX2Irmc,14735
+sec2md/parser.py,sha256=-uyorKhrXrn_3dKMqq4peo2bdxcGvkQVHI5riSXX7z4,47558
+sec2md/section_extractor.py,sha256=otx4RObfNqP1zStilis9z4gDXp4mkN-9-tzIMACEIaE,28050
+sec2md/sections.py,sha256=wtmKqF_KP_G-7_qAxGvxs25U_4vcH5NDGn14ouEy5GE,2784
+sec2md/table_parser.py,sha256=FhR8OwX5NAJmzdbTFzHQTGUNUPieYN37UzMFbQMkogU,12551
+sec2md/utils.py,sha256=2lUeN5irTbdIyjylCkaPKMv4ALWxWMJl96PTO8FV3Ik,2990
+sec2md/chunker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sec2md/chunker/blocks.py,sha256=LiPV0GX0LYGkV-3kfxeBA9OCmMVjOjrwL46PH8snXw4,3388
+sec2md/chunker/chunk.py,sha256=eF7QAOita6AW_sp2Sg69853ZOH7npwM5o-AEem62RRk,4729
+sec2md/chunker/chunker.py,sha256=_VhrxfSCarnPGIx6LHIurgCEiwH3Tz7kVZuECgTNw2w,10588
+sec2md-0.1.10.dist-info/licenses/LICENSE,sha256=uJDiSGQ5TOx-PGhu2LGH4A-O53vS5hrQ5sc3j2Ps_Rk,1071
+sec2md-0.1.10.dist-info/METADATA,sha256=xW9Jin_IALBKHTlFzHnY9inkHmKLmf9jCio5jYc-EnY,7626
+sec2md-0.1.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+sec2md-0.1.10.dist-info/top_level.txt,sha256=Jpmw3laEWwS9fljtAEg4sExjFw3zP8dGarjIknyh1v8,7
+sec2md-0.1.10.dist-info/RECORD,,
sec2md-0.1.5.dist-info/RECORD
DELETED
@@ -1,19 +0,0 @@
-sec2md/__init__.py,sha256=iR_2g-PDkCAzY76uQwBjIVpprvkxlNopdmDduzDp8lg,1037
-sec2md/absolute_table_parser.py,sha256=rphc5_HttniV2RtPCThQ68HWyyZIn9l-gkaFsbtQXBU,22982
-sec2md/chunking.py,sha256=SQASDA057bKLhSj34GNAHrRl94Rf-A9WlfEvhhWPuIc,6350
-sec2md/core.py,sha256=hmdJXitoEWuekR5f3B1oEK1xmPux0t494lOpg5aJrRk,2663
-sec2md/models.py,sha256=H_3HnI8exGVnbqbdT1Bf4bNhPLjqvlP64ud0au5ohJk,14735
-sec2md/parser.py,sha256=J1He6XMa1Mf9YGJCEffWuCs7SAqi0Ts6S445CTO-lAA,47559
-sec2md/section_extractor.py,sha256=JTbZpPgmTipzU1Q5LehlQ9y2X4ZcQRTj3A7iMr90iqM,25976
-sec2md/sections.py,sha256=wtmKqF_KP_G-7_qAxGvxs25U_4vcH5NDGn14ouEy5GE,2784
-sec2md/table_parser.py,sha256=FhR8OwX5NAJmzdbTFzHQTGUNUPieYN37UzMFbQMkogU,12551
-sec2md/utils.py,sha256=2lUeN5irTbdIyjylCkaPKMv4ALWxWMJl96PTO8FV3Ik,2990
-sec2md/chunker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sec2md/chunker/markdown_blocks.py,sha256=yEF_v72DvYOVu0ZQ5bBCFpNM12INg-8RmajIu_dorQQ,4372
-sec2md/chunker/markdown_chunk.py,sha256=hCMpjn0cc5TIjWSZviq4fM7e781X3AtRcmI60pDLWro,4763
-sec2md/chunker/markdown_chunker.py,sha256=IYW8pQ2q9hX1lRGw4TnKAQcr-HmJfSW7wffu-BA0Jms,10743
-sec2md-0.1.5.dist-info/licenses/LICENSE,sha256=uJDiSGQ5TOx-PGhu2LGH4A-O53vS5hrQ5sc3j2Ps_Rk,1071
-sec2md-0.1.5.dist-info/METADATA,sha256=YWQ9uiut1LcBQxOCvFcT8MlfgLO7VBCDtEju5h7fp6k,7625
-sec2md-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-sec2md-0.1.5.dist-info/top_level.txt,sha256=Jpmw3laEWwS9fljtAEg4sExjFw3zP8dGarjIknyh1v8,7
-sec2md-0.1.5.dist-info/RECORD,,

{sec2md-0.1.5.dist-info → sec2md-0.1.10.dist-info}/WHEEL
File without changes
{sec2md-0.1.5.dist-info → sec2md-0.1.10.dist-info}/licenses/LICENSE
File without changes
{sec2md-0.1.5.dist-info → sec2md-0.1.10.dist-info}/top_level.txt
File without changes