datamule 0.380__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/__init__.py +46 -86
- datamule/book.py +16 -0
- datamule/config.py +29 -0
- datamule/data/company_former_names.csv +8148 -8148
- datamule/data/company_metadata.csv +10049 -10049
- datamule/data/company_tickers.csv +9999 -10168
- datamule/data/sec-glossary.csv +728 -728
- datamule/data/xbrl_descriptions.csv +10024 -10024
- datamule/document.py +278 -0
- datamule/downloader/downloader.py +374 -0
- datamule/downloader/premiumdownloader.py +335 -0
- datamule/helper.py +123 -136
- datamule/mapping_dicts/txt_mapping_dicts.py +232 -0
- datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
- datamule/monitor.py +238 -0
- datamule/mulebot/__init__.py +1 -1
- datamule/mulebot/helper.py +34 -34
- datamule/mulebot/mulebot.py +129 -129
- datamule/mulebot/mulebot_server/server.py +86 -86
- datamule/mulebot/mulebot_server/static/css/minimalist.css +173 -173
- datamule/mulebot/mulebot_server/static/scripts/artifacts.js +67 -67
- datamule/mulebot/mulebot_server/static/scripts/chat.js +91 -91
- datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +55 -55
- datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +14 -14
- datamule/mulebot/mulebot_server/static/scripts/main.js +56 -56
- datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +26 -26
- datamule/mulebot/mulebot_server/static/scripts/suggestions.js +46 -46
- datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +128 -128
- datamule/mulebot/mulebot_server/static/scripts/utils.js +27 -27
- datamule/mulebot/mulebot_server/templates/chat-minimalist.html +90 -90
- datamule/mulebot/search.py +51 -51
- datamule/mulebot/tools.py +82 -82
- datamule/packageupdater.py +207 -0
- datamule/portfolio.py +106 -0
- datamule/submission.py +76 -0
- datamule-1.0.0.dist-info/METADATA +27 -0
- datamule-1.0.0.dist-info/RECORD +40 -0
- {datamule-0.380.dist-info → datamule-1.0.0.dist-info}/WHEEL +1 -1
- datamule/data/filing_types.csv +0 -485
- datamule/data/ftd_locations.csv +0 -388
- datamule/datamule_api.py +0 -21
- datamule/dataset_builder/_init.py +0 -1
- datamule/dataset_builder/dataset_builder.py +0 -260
- datamule/downloader/__init__.py +0 -0
- datamule/downloader/dropbox_downloader.py +0 -225
- datamule/downloader/ftd.py +0 -216
- datamule/downloader/information_table_13f.py +0 -231
- datamule/downloader/sec_downloader.py +0 -635
- datamule/filing_viewer/__init__.py +0 -1
- datamule/filing_viewer/filing_viewer.py +0 -256
- datamule/global_vars.py +0 -202
- datamule/parser/__init__.py +0 -1
- datamule/parser/basic_10k_parser.py +0 -82
- datamule/parser/basic_10q_parser.py +0 -73
- datamule/parser/basic_13d_parser.py +0 -58
- datamule/parser/basic_13g_parser.py +0 -61
- datamule/parser/basic_8k_parser.py +0 -84
- datamule/parser/company_concepts_parser.py +0 -0
- datamule/parser/form_d_parser.py +0 -70
- datamule/parser/generalized_item_parser.py +0 -78
- datamule/parser/generalized_xml_parser.py +0 -0
- datamule/parser/helper.py +0 -75
- datamule/parser/information_table_parser_13fhr.py +0 -41
- datamule/parser/insider_trading_parser.py +0 -158
- datamule/parser/mappings.py +0 -95
- datamule/parser/n_port_p_parser.py +0 -70
- datamule/parser/sec_parser.py +0 -79
- datamule/parser/sgml_parser.py +0 -180
- datamule/sec_filing.py +0 -126
- datamule/sec_search.py +0 -20
- datamule-0.380.dist-info/METADATA +0 -110
- datamule-0.380.dist-info/RECORD +0 -61
- {datamule-0.380.dist-info → datamule-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,256 +0,0 @@
|
|
1
|
-
|
2
|
-
# AI generated slop. Quick workaround to get mulebot server artifact to build correctly. Will rewrite later.
|
3
|
-
import re
|
4
|
-
|
5
|
-
def create_valid_id(title):
    """Turn a section title into a string usable as an HTML ``id``.

    Every run of characters other than word characters, hyphen, period or
    colon is replaced by a single hyphen. If the result does not begin with
    a letter (including when it is empty) it is prefixed with ``section-``.
    The id is returned in lower case.

    Args:
        title: Section title text.

    Returns:
        The lower-cased id string.
    """
    valid_id = re.sub(r'[^\w\-.:]+', '-', title)
    # Guard the empty case first: indexing [0] on an empty string raises.
    if not valid_id or not valid_id[0].isalpha():
        valid_id = 'section-' + valid_id
    return valid_id.lower()
|
13
|
-
|
14
|
-
def create_content(content, level=1):
    """Render a list of section dicts as nested ``<div class="section">`` HTML.

    Each item may carry ``title``, ``text`` and ``content`` (a nested list
    rendered recursively one level deeper). Titled sections get their id from
    ``create_valid_id``; untitled ones get ``section-<level>-<index>``.

    Title and text are HTML-escaped before being inserted, so characters such
    as ``<`` and ``&`` in the source text cannot break or inject markup.

    Args:
        content: Iterable of section dicts.
        level: Nesting depth used in the ``level-N`` class and fallback ids.

    Returns:
        The concatenated HTML string ("" for empty input).
    """
    # Imported locally under a distinct name so this block does not depend on
    # a file-level ``import html``.
    from html import escape

    parts = []
    for index, item in enumerate(content):
        if 'title' in item:
            section_id = create_valid_id(item['title'])
        else:
            section_id = f'section-{level}-{index}'

        parts.append(f'<div class="section level-{level}" id="{section_id}">')

        if 'title' in item:
            parts.append(f'<h3 class="section-title">{escape(str(item["title"]))}</h3>')
        if 'text' in item:
            parts.append(f'<p class="section-text">{escape(str(item["text"]))}</p>')

        if 'content' in item:
            parts.append('<div class="sub-content">')
            parts.append(create_content(item['content'], level + 1))
            parts.append('</div>')

        parts.append('</div>')

    return ''.join(parts)
|
37
|
-
|
38
|
-
def json_to_html(data):
    """Build the dashboard markup (sidebar, header and sections) for a filing.

    Args:
        data: Mapping with ``cik``, ``accession_number`` and ``document``
            (an iterable of dicts). Documents that have a ``content`` key are
            rendered through ``create_content``; the others are skipped.

    Returns:
        The HTML string for the ``dashboard-container`` element.

    Raises:
        KeyError: If ``cik``, ``accession_number`` or ``document`` is missing.
    """
    # Imported locally under a distinct name so this block does not depend on
    # a file-level ``import html``.
    from html import escape

    # Metadata is escaped so unexpected characters cannot break the header.
    cik = escape(str(data['cik']))
    accession_number = escape(str(data['accession_number']))

    parts = ['<div class="dashboard-container">']

    # Sidebar: the list is left empty here; the page script fills it in.
    parts.append('''
    <div class="sidebar" id="sidebar">
        <h2 class="sidebar-title">Sections</h2>
        <ul id="section-list" class="nav flex-column"></ul>
    </div>
    ''')

    # Main content
    parts.append('<div class="main-content">')

    # Header
    parts.append(f'''
    <header class="dashboard-header">
        <h1 class="dashboard-title">Filing Viewer</h1>
        <p class="dashboard-subtitle">CIK: {cik} | Accession Number: {accession_number}</p>
    </header>
    ''')

    # Document content
    for doc in data['document']:
        if 'content' in doc:
            parts.append(create_content(doc['content']))

    # Closes main-content and dashboard-container.
    parts.append('</div></div>')

    return ''.join(parts)
|
68
|
-
|
69
|
-
def create_interactive_filing(json_data):
    """Return a complete standalone HTML page for a filing.

    The page embeds the markup produced by ``json_to_html(json_data)`` between
    an inline stylesheet and an inline script. The script fills the sidebar
    list from the rendered ``.section`` elements that have a
    ``.section-title``, scrolls to a section when its link is clicked, and
    marks the link of an intersecting section as active.
    """
    dashboard = json_to_html(json_data)
    # Literal CSS/JS braces are doubled because this is an f-string.
    page = f'''
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <title>SEC 10-K Premium Executive Dashboard</title>
        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/bootstrap/5.3.0/css/bootstrap.min.css">
        <style>
            :root {{
                --primary-color: #333;
                --secondary-color: #666;
                --background-color: #f8f8f8;
                --text-color: #333;
                --border-color: #e0e0e0;
            }}
            body {{
                font-family: 'Arial', sans-serif;
                background-color: var(--background-color);
                color: var(--text-color);
                line-height: 1.6;
            }}
            .dashboard-container {{
                display: flex;
                min-height: 100vh;
            }}
            .sidebar {{
                width: 250px;
                background-color: white;
                border-right: 1px solid var(--border-color);
                padding: 20px;
                position: fixed;
                height: 100vh;
                overflow-y: auto;
                transition: transform 0.3s ease-in-out;
            }}
            .main-content {{
                flex-grow: 1;
                margin-left: 250px;
                padding: 40px;
            }}
            .dashboard-header {{
                margin-bottom: 40px;
            }}
            .dashboard-title {{
                color: var(--primary-color);
                font-weight: bold;
                font-size: 2.5rem;
                margin-bottom: 10px;
            }}
            .dashboard-subtitle {{
                color: var(--secondary-color);
                font-size: 1rem;
            }}
            .section {{
                margin-bottom: 30px;
                padding-left: 20px;
                border-left: 2px solid var(--border-color);
            }}
            .section-title {{
                color: var(--primary-color);
                font-size: 1.5rem;
                margin-bottom: 15px;
            }}
            .level-2 {{ margin-left: 20px; }}
            .level-3 {{ margin-left: 40px; }}
            .nav-link {{
                color: var(--text-color);
                transition: all 0.3s ease;
                padding: 5px 10px;
                margin-bottom: 5px;
                border-radius: 4px;
            }}
            .nav-link:hover, .nav-link.active {{
                background-color: var(--background-color);
                color: var(--primary-color);
            }}
            .sidebar-title {{
                font-size: 1.2rem;
                color: var(--primary-color);
                margin-bottom: 20px;
                padding-bottom: 10px;
                border-bottom: 1px solid var(--border-color);
            }}
            .toggle-sidebar {{
                display: none;
                position: fixed;
                top: 10px;
                left: 10px;
                z-index: 1000;
                background-color: var(--primary-color);
                color: white;
                border: none;
                padding: 10px;
                border-radius: 5px;
            }}
            @media (max-width: 768px) {{
                .sidebar {{
                    transform: translateX(-100%);
                    z-index: 1000;
                }}
                .sidebar.active {{
                    transform: translateX(0);
                }}
                .main-content {{
                    margin-left: 0;
                    padding: 20px;
                }}
                .toggle-sidebar {{
                    display: block;
                }}
                .dashboard-title {{
                    font-size: 2rem;
                }}
                .section {{
                    padding-left: 10px;
                }}
                .level-2, .level-3 {{
                    margin-left: 10px;
                }}
            }}
        </style>
    </head>
    <body>
        <button class="toggle-sidebar" id="toggleSidebar">☰</button>
        {dashboard}
        <script>
            document.addEventListener('DOMContentLoaded', (event) => {{
                const sidebar = document.getElementById('sidebar');
                const toggleSidebar = document.getElementById('toggleSidebar');
                const sections = document.querySelectorAll('.section');
                const sectionList = document.getElementById('section-list');

                toggleSidebar.addEventListener('click', () => {{
                    sidebar.classList.toggle('active');
                }});

                sections.forEach((section, index) => {{
                    const title = section.querySelector('.section-title');
                    if (title) {{
                        const listItem = document.createElement('li');
                        const link = document.createElement('a');
                        link.href = `#${{section.id}}`;
                        link.className = 'nav-link';
                        link.textContent = title.textContent;
                        listItem.appendChild(link);
                        sectionList.appendChild(listItem);

                        link.addEventListener('click', (e) => {{
                            e.preventDefault();
                            section.scrollIntoView({{behavior: 'smooth'}});
                            if (window.innerWidth <= 768) {{
                                sidebar.classList.remove('active');
                            }}
                        }});
                    }}
                }});

                const observerOptions = {{
                    root: null,
                    rootMargin: '0px',
                    threshold: 0.5
                }};

                const observer = new IntersectionObserver((entries) => {{
                    entries.forEach(entry => {{
                        if (entry.isIntersecting) {{
                            const id = entry.target.id;
                            document.querySelectorAll('.nav-link').forEach(navLink => {{
                                navLink.classList.remove('active');
                                if (navLink.getAttribute('href') === `#${{id}}`) {{
                                    navLink.classList.add('active');
                                }}
                            }});
                        }}
                    }});
                }}, observerOptions);

                sections.forEach(section => {{
                    observer.observe(section);
                }});
            }});
        </script>
    </body>
    </html>
    '''
    return page
|
datamule/global_vars.py
DELETED
@@ -1,202 +0,0 @@
|
|
1
|
-
headers = {
|
2
|
-
"User-Agent": "Peter Smith petersmith@gmail.com" # Replace with your information
|
3
|
-
}
|
4
|
-
|
5
|
-
dataset_10k_url_list = [
|
6
|
-
{'year': 2024, 'urls': [
|
7
|
-
'https://www.dropbox.com/scl/fi/3gd9whn8qtychbxuxnbsa/2024_archive.zip.001?rlkey=2n8qwhcccevniqkvy39ksa467&st=hn3kacs6&dl=1',
|
8
|
-
'https://www.dropbox.com/scl/fi/8citjlh4h58speyag3hd9/2024_archive.zip.002?rlkey=ymadt6wc81e9m3a15znwum7s1&st=opzcpxye&dl=1'
|
9
|
-
]},
|
10
|
-
{'year': 2023, 'urls': [
|
11
|
-
'https://www.dropbox.com/scl/fi/hdnb6bbr7l3xgrfmc73ht/2023_archive.zip.001?rlkey=kd0npzwvscacfdz0syq2irnu7&st=nzmh3lwr&dl=1',
|
12
|
-
'https://www.dropbox.com/scl/fi/ubiyq3tssa95enbb8xi9u/2023_archive.zip.002?rlkey=xkef3tx3q5a4f3oh38tx4cjy4&st=z3nrs8g3&dl=1'
|
13
|
-
]},
|
14
|
-
{'year': 2022, 'urls': [
|
15
|
-
'https://www.dropbox.com/scl/fi/rlhvogepk9cpnohhq4gs7/2022_archive.zip.001?rlkey=81hmjgdt1rtjub64wrlp9oy5t&st=i6ecnbux&dl=1',
|
16
|
-
'https://www.dropbox.com/scl/fi/r5m6y1j8uf02uy61u3fcn/2022_archive.zip.002?rlkey=z80qlgjifbtf5mjuqlu98478p&st=7wqvhekh&dl=1'
|
17
|
-
]},
|
18
|
-
{'year': 2021, 'urls': [
|
19
|
-
'https://www.dropbox.com/scl/fi/wemvdqxsqddlhlcgon36g/2021_archive.zip.001?rlkey=tjl3525vn60zwosnqdgznecj5&st=66bycsgf&dl=1',
|
20
|
-
'https://www.dropbox.com/scl/fi/si0nynzxxf31kxpxobzrf/2021_archive.zip.002?rlkey=93oczu6hs5iusex2f65k2mxc7&st=x8cymp6w&dl=1'
|
21
|
-
]},
|
22
|
-
{'year': 2020, 'urls': [
|
23
|
-
'https://www.dropbox.com/scl/fi/vxvgwrw2q04qlj5m2aoog/2020_archive.zip.001?rlkey=88h3x78axn5ghvk9t5otqpdjd&st=72xwi1y1&dl=1',
|
24
|
-
'https://www.dropbox.com/scl/fi/9blysoqztxg5vedrf2l2i/2020_archive.zip.002?rlkey=msvos1omcb8fowb4q1nm38m6e&st=bscfunry&dl=1'
|
25
|
-
]},
|
26
|
-
{'year': 2019, 'urls': [
|
27
|
-
'https://www.dropbox.com/scl/fi/hq5o9zo8xrqmd7l4o06hy/2019_archive.zip.001?rlkey=sazeziru87k7qptqhxenv0d6m&st=241jmwwd&dl=1',
|
28
|
-
'https://www.dropbox.com/scl/fi/2jyxw65unxhhsk5fuhuon/2019_archive.zip.002?rlkey=nzyf1em08qgxdhpz2vuoj417u&st=ii9zpdxi&dl=1'
|
29
|
-
]},
|
30
|
-
{'year': 2018, 'urls': [
|
31
|
-
'https://www.dropbox.com/scl/fi/c1vexzflxr6qcsg25nxp7/2018_archive.zip.001?rlkey=hnb5zeashbtqfhxsnf9vt94vv&st=wy9i633f&dl=1',
|
32
|
-
'https://www.dropbox.com/scl/fi/yzt3464lscpmy5n39olk5/2018_archive.zip.002?rlkey=tu3lbnjnd1xwni8f6nfpbmtgm&st=c0zur5sz&dl=1'
|
33
|
-
]},
|
34
|
-
{'year': 2017, 'urls': [
|
35
|
-
'https://www.dropbox.com/scl/fi/3trjwjx6v64ilnt8nyp02/2017_archive.zip.001?rlkey=vl4x1rrp0fisjy3djrraayjoe&st=ept0d24k&dl=1',
|
36
|
-
'https://www.dropbox.com/scl/fi/p011jrntmkrmlb9u84k62/2017_archive.zip.002?rlkey=55uka4y2d90eb5d8lgu86yl6c&st=ildtcc94&dl=1'
|
37
|
-
]},
|
38
|
-
{'year': 2016, 'urls': [
|
39
|
-
'https://www.dropbox.com/scl/fi/5oydfbume2mxqfobn2e9r/2016_archive.zip.001?rlkey=4h76gl9ny8e7vgcdnphf7bzn9&st=jkr0ioby&dl=1',
|
40
|
-
'https://www.dropbox.com/scl/fi/faofea4f2mkzjslt12s0b/2016_archive.zip.002?rlkey=bolnuqm3fq7yrfqhf5ek92dgp&st=33w8ivrx&dl=1'
|
41
|
-
]},
|
42
|
-
{'year': 2015, 'urls': [
|
43
|
-
'https://www.dropbox.com/scl/fi/75rdrrsrgbg95qcedcr65/2015_archive.zip.001?rlkey=pb4ec6sda3ii0lnzua4enxnr3&st=t7wkjb60&dl=1',
|
44
|
-
'https://www.dropbox.com/scl/fi/ixfttx508tp8cuf3xismr/2015_archive.zip.002?rlkey=xcoqtcx3vjnh3ctxhpqe4jv2j&st=56fgbb8w&dl=1'
|
45
|
-
]},
|
46
|
-
{'year': 2014, 'urls': [
|
47
|
-
'https://www.dropbox.com/scl/fi/1y1j6ct6mox76euu38t2c/2014_archive.zip.001?rlkey=hwh83ttl3nahb1oegib05p3k7&st=d01umhdp&dl=1',
|
48
|
-
'https://www.dropbox.com/scl/fi/bh2yu3coqcshj5mybk3wd/2014_archive.zip.002?rlkey=0g4ftzhytyn3vk8kgwu72b6lf&st=jz9pzdoy&dl=1'
|
49
|
-
]},
|
50
|
-
{'year': 2013, 'urls': [
|
51
|
-
'https://www.dropbox.com/scl/fi/jraed38u18c9y16mwcnmo/2013_archive.zip.001?rlkey=fvy6flk8uxk2mn5wjvynu96ag&st=3sivwbx7&dl=1',
|
52
|
-
'https://www.dropbox.com/scl/fi/cgi8opfbnu727seazzmvd/2013_archive.zip.002?rlkey=sm7h7wfzud22u3ed1pw8fr7u9&st=19tunve8&dl=1'
|
53
|
-
]},
|
54
|
-
{'year': 2012, 'urls': [
|
55
|
-
'https://www.dropbox.com/scl/fi/hji2bb1ce2wdwf5yc6dyf/2012_archive.zip.001?rlkey=0r53m8roo6e8grqez3lnhpayk&st=1jx5jq4r&dl=1',
|
56
|
-
'https://www.dropbox.com/scl/fi/hqoh4l305b168619eytkj/2012_archive.zip.002?rlkey=2laeldqzlwskwoha9idmioolf&st=1w8zowyp&dl=1'
|
57
|
-
]},
|
58
|
-
{'year': 2011, 'urls': [
|
59
|
-
'https://www.dropbox.com/scl/fi/z7z8qnmf73hqr33b386zu/2011_archive.zip.001?rlkey=kdkd3urxmo830n30gwiapqvkz&st=2hsuxpcm&dl=1',
|
60
|
-
'https://www.dropbox.com/scl/fi/illd2qfsj2vuy4yjd13el/2011_archive.zip.002?rlkey=oewcg57c92wlbufwhon21mjeq&st=ir05xure&dl=1'
|
61
|
-
]},
|
62
|
-
{'year': 2010, 'urls': [
|
63
|
-
'https://www.dropbox.com/scl/fi/j41ta06g0fso473x4oa1f/2010_archive.zip.001?rlkey=1r83ibenn06fxs6zhm6oi46pr&st=iia9qtid&dl=1',
|
64
|
-
'https://www.dropbox.com/scl/fi/31b6huoywrrc44b76wm1w/2010_archive.zip.002?rlkey=40jfl7zqnw5sikgd4wuo1095m&st=igy563mu&dl=1'
|
65
|
-
]},
|
66
|
-
{'year': 2009, 'urls': [
|
67
|
-
'https://www.dropbox.com/scl/fi/4y6c1icwvkjwwqbgx4w1a/2009_archive.zip.001?rlkey=3qqp4ikinplktw6g39x68rdmj&st=dcg0eik1&dl=1',
|
68
|
-
'https://www.dropbox.com/scl/fi/jq808ah0j0vg1sqdmvnnm/2009_archive.zip.002?rlkey=wbgtvj8fkpgmcj5oxpj88jfog&st=4txza2fu&dl=1'
|
69
|
-
]},
|
70
|
-
{'year': 2008, 'urls': [
|
71
|
-
'https://www.dropbox.com/scl/fi/uf1ym44ns1936uj8vqfwk/2008_archive.zip.001?rlkey=9rivaprk7yjrutdfu7sqo3jxv&st=gj75o1ne&dl=1'
|
72
|
-
]},
|
73
|
-
{'year': 2007, 'urls': [
|
74
|
-
'https://www.dropbox.com/scl/fi/58qmwj7m3rrl7kr00lwat/2007_archive.zip.001?rlkey=815zw0gnb7gowcdf0iuvtctqh&st=7cajp3ii&dl=1'
|
75
|
-
]},
|
76
|
-
{'year': 2006, 'urls': [
|
77
|
-
'https://www.dropbox.com/scl/fi/gxi8qzpz53f9qcvn2hpl5/2006_archive.zip.001?rlkey=hmehz1azpbxzpw6j1wy5eppq0&st=czxrdcnh&dl=1'
|
78
|
-
]},
|
79
|
-
{'year': 2005, 'urls': [
|
80
|
-
'https://www.dropbox.com/scl/fi/tjstefvwfzs3p0a1vzlbg/2005_archive.zip.001?rlkey=825m16ziekd9mwc3ybvjvisj0&st=4dyy121i&dl=1'
|
81
|
-
]},
|
82
|
-
{'year': 2004, 'urls': [
|
83
|
-
'https://www.dropbox.com/scl/fi/2g25emvme8gqoxnv5fhla/2004_archive.zip.001?rlkey=lz8oyniqgc7xvn343d39600ic&st=yvmi9h1t&dl=1'
|
84
|
-
]},
|
85
|
-
{'year': 2003, 'urls': [
|
86
|
-
'https://www.dropbox.com/scl/fi/d2ub0o4sqo0b2evd9s9z3/2003_archive.zip.001?rlkey=qxsltqjfxnk0xrp0qx5c49v57&st=cc4dzjo2&dl=1'
|
87
|
-
]},
|
88
|
-
{'year': 2002, 'urls': [
|
89
|
-
'https://www.dropbox.com/scl/fi/9xy6y09y2b5zp4w4c7cty/2002_archive.zip.001?rlkey=c4qqact06zz7ykmfc1n5odf2a&st=d93mptu8&dl=1'
|
90
|
-
]},
|
91
|
-
{'year': 2001, 'urls': [
|
92
|
-
'https://www.dropbox.com/scl/fi/b3miae1kvths87e0cq8fs/2001_archive.zip.001?rlkey=g21mnbzicju3czney275bpjyp&st=0nji6q8l&dl=1'
|
93
|
-
]}
|
94
|
-
]
|
95
|
-
|
96
|
-
dataset_10q_url_list = [
|
97
|
-
{'year': 2001, 'urls': ['https://www.dropbox.com/scl/fi/1bzig8wabbtezfg6dipou/2001_archive.zip.001?rlkey=4sddwex8k9kd4jahypsxgvbs2&st=s5x7wnh5&dl=1']},
|
98
|
-
{'year': 2002, 'urls': ['https://www.dropbox.com/scl/fi/qe0om30w0dhtg3byse8n1/2002_archive.zip.001?rlkey=7ji21x7ppavstwoe1dz7028r5&st=hb5dkzo2&dl=1']},
|
99
|
-
{'year': 2003, 'urls': ['https://www.dropbox.com/scl/fi/9dfpof4es1kfdttpejkb1/2003_archive.zip.001?rlkey=36xwaacvtb3rw8mqkd1dwgav2&st=vfmo1tph&dl=1']},
|
100
|
-
{'year': 2004, 'urls': ['https://www.dropbox.com/scl/fi/l2tv1ywmltlx2ygbmy1k4/2004_archive.zip.001?rlkey=sdrczfb9irv9q2xi10a2y169j&st=6a1v0pos&dl=1']},
|
101
|
-
{'year': 2005, 'urls': ['https://www.dropbox.com/scl/fi/16euq6ies55c0q4z2ws0q/2005_archive.zip.001?rlkey=l80ig2irksajd7djmlv6bith0&st=yc5ing9t&dl=1']},
|
102
|
-
{'year': 2006, 'urls': ['https://www.dropbox.com/scl/fi/wi66433i8xdh3g6ozozod/2006_archive.zip.001?rlkey=zg09b09mdg77ni8zsq7p8dex9&st=347utf3x&dl=1']},
|
103
|
-
{'year': 2007, 'urls': ['https://www.dropbox.com/scl/fi/u3bazimzkkps8qfvaubxm/2007_archive.zip.001?rlkey=fuj28imnb2bjskx2wggoyuvuu&st=76jka6tc&dl=1']},
|
104
|
-
{'year': 2008, 'urls': [
|
105
|
-
'https://www.dropbox.com/scl/fi/htc6j3c9l17ey6urjzm63/2008_archive.zip.001?rlkey=9pnl5066d33x6wan8uqhvom5q&st=fsgogqya&dl=1',
|
106
|
-
'https://www.dropbox.com/scl/fi/47jb2sipfg13b5p6dzegb/2008_archive.zip.002?rlkey=ml6zfxrptg1jgebpd6bdmztej&st=3unkdpnr&dl=1'
|
107
|
-
]},
|
108
|
-
{'year': 2009, 'urls': [
|
109
|
-
'https://www.dropbox.com/scl/fi/krrc6zx5cvbyhhskrrdlp/2009_archive.zip.001?rlkey=nggwp1z5ekrvgnxlg434vtfqg&st=mn6x1fqp&dl=1',
|
110
|
-
'https://www.dropbox.com/scl/fi/w46430sm52bd1bioc94f7/2009_archive.zip.002?rlkey=06ilznlorppqmhpj17wax1id9&st=ozloyl3v&dl=1'
|
111
|
-
]},
|
112
|
-
{'year': 2010, 'urls': [
|
113
|
-
'https://www.dropbox.com/scl/fi/g86fzg6dysnt34raq352k/2010_archive.zip.001?rlkey=2kil6s78cj6p5bk8r0eptxygf&st=vyauu3sl&dl=1',
|
114
|
-
'https://www.dropbox.com/scl/fi/46ttnl8pb1qfk5icd7n4q/2010_archive.zip.002?rlkey=9nok0mg3mjexybywq1og6vdux&st=oia2g4fy&dl=1'
|
115
|
-
]},
|
116
|
-
{'year': 2011, 'urls': [
|
117
|
-
'https://www.dropbox.com/scl/fi/ems2oygr0u3voq38yisuk/2011_archive.zip.001?rlkey=4lfsowv9o6wmkmozn3pdr80sh&st=3kn4ghbe&dl=1',
|
118
|
-
'https://www.dropbox.com/scl/fi/ihersfxuqnnli1fhze9wc/2011_archive.zip.002?rlkey=iirqvy919yv3pkvem2owsdgxy&st=0eb8rtbq&dl=1'
|
119
|
-
]},
|
120
|
-
{'year': 2012, 'urls': [
|
121
|
-
'https://www.dropbox.com/scl/fi/dima81xb776o6r9rmvxf6/2012_archive.zip.001?rlkey=wlyma7xg70hllk0wutx4boqif&st=301p8dq2&dl=1',
|
122
|
-
'https://www.dropbox.com/scl/fi/b7h7a3b83c7pkx1ayz5tx/2012_archive.zip.002?rlkey=380e4viezrorkbdgs16j9qyig&st=35f8jjt9&dl=1'
|
123
|
-
]},
|
124
|
-
{'year': 2013, 'urls': [
|
125
|
-
'https://www.dropbox.com/scl/fi/5z0rubg54kgt60sp3w8ir/2013_archive.zip.001?rlkey=9b1ff6vw6v76g9p6n20z0pf1y&st=p2kouaw2&dl=1',
|
126
|
-
'https://www.dropbox.com/scl/fi/g0n2vtrc3nsjou1t7zdv8/2013_archive.zip.002?rlkey=42id27sv2tzz4nt2lb999kjo0&st=teww7pk6&dl=1'
|
127
|
-
]},
|
128
|
-
{'year': 2014, 'urls': [
|
129
|
-
'https://www.dropbox.com/scl/fi/25kr0m6nfz1uvecpzsl3g/2014_archive.zip.001?rlkey=9b4v6eevhrsqx4yxr4syl3xx9&st=crdzt5e5&dl=1',
|
130
|
-
'https://www.dropbox.com/scl/fi/k0dt79eyjuvhxvrdepsat/2014_archive.zip.002?rlkey=u44c7wysi21tpvo7p2emspr96&st=gdzb6vbl&dl=1'
|
131
|
-
]},
|
132
|
-
{'year': 2015, 'urls': [
|
133
|
-
'https://www.dropbox.com/scl/fi/jw3a4ua6qgy439jm5guwb/2015_archive.zip.001?rlkey=vsah3muoz6po9iwgfmy6idax4&st=8np4xe5t&dl=1',
|
134
|
-
'https://www.dropbox.com/scl/fi/kmk5p3ynpf4e4n1zu4ead/2015_archive.zip.002?rlkey=v5z3sli6unlqomdlgq2vsfmyy&st=684ulwyp&dl=1'
|
135
|
-
]},
|
136
|
-
{'year': 2016, 'urls': [
|
137
|
-
'https://www.dropbox.com/scl/fi/veo77wy3muzg7jua1pnon/2016_archive.zip.001?rlkey=xlh62swhnywcruck89ix7zsnv&st=p1u5mrql&dl=1',
|
138
|
-
'https://www.dropbox.com/scl/fi/nf4ue014vnf8i5wd3ifq8/2016_archive.zip.002?rlkey=kpnh9hmw7bonbjj3a1qtmx3wr&st=2o1ljgk3&dl=1'
|
139
|
-
]},
|
140
|
-
{'year': 2017, 'urls': [
|
141
|
-
'https://www.dropbox.com/scl/fi/ma6kdn0zmr0jsfjuwyrr8/2017_archive.zip.001?rlkey=cmcrs84513amzd0xtnhgowjig&st=2y20plzl&dl=1',
|
142
|
-
'https://www.dropbox.com/scl/fi/7pqfkoalf6kwdxglkd4rd/2017_archive.zip.002?rlkey=pu9gpwj8s58jpxaa5bo4qdt2t&st=7cjfuewb&dl=1'
|
143
|
-
]},
|
144
|
-
{'year': 2018, 'urls': [
|
145
|
-
'https://www.dropbox.com/scl/fi/76smlo78ilea1h1x5ej5p/2018_archive.zip.001?rlkey=9s7ccdm0il6nash54x7lpzlyq&st=nugdjlct&dl=1',
|
146
|
-
'https://www.dropbox.com/scl/fi/ewdm0f8bztpq9290c0bzk/2018_archive.zip.002?rlkey=6baqb8j9ptu17f3r6xvlceuot&st=faj7cbyf&dl=1'
|
147
|
-
]},
|
148
|
-
{'year': 2019, 'urls': [
|
149
|
-
'https://www.dropbox.com/scl/fi/9uk4a45vvpda567sonboo/2019_archive.zip.001?rlkey=v0me7vf0lamwue2g936sdduo8&st=30ehpju3&dl=1',
|
150
|
-
'https://www.dropbox.com/scl/fi/7uzovuhycbi8gt2fb84jk/2019_archive.zip.002?rlkey=vckqm3ekb7xcmd0m8whfzvmsv&st=yyxsxzhc&dl=1'
|
151
|
-
]},
|
152
|
-
{'year': 2020, 'urls': [
|
153
|
-
'https://www.dropbox.com/scl/fi/85aiiz3kun6r8zetjgjgw/2020_archive.zip.001?rlkey=3z55z1kvkgd7vjlit69v3peu4&st=6bqx7i9f&dl=1',
|
154
|
-
'https://www.dropbox.com/scl/fi/gc5lt1cocx4fukcx5wmpi/2020_archive.zip.002?rlkey=kpwpswwy5za0d7xspgqu3yq1r&st=8do0y1so&dl=1',
|
155
|
-
'https://www.dropbox.com/scl/fi/1zkkim7118qqhy2ktordl/2020_archive.zip.003?rlkey=jryn61lym4x5vf6z7t9uqidt7&st=mpl7uu8e&dl=1'
|
156
|
-
]},
|
157
|
-
{'year': 2021, 'urls': [
|
158
|
-
'https://www.dropbox.com/scl/fi/kraiuj98f1at7pepcfdbl/2021_archive.zip.001?rlkey=7x1ppre2o05cdypmsq1quv9so&st=rqqq3skc&dl=1',
|
159
|
-
'https://www.dropbox.com/scl/fi/s45tc1e97384ov73zcrrm/2021_archive.zip.002?rlkey=t7c6was2nt5v73bjmyyknma4g&st=ts1esu9j&dl=1',
|
160
|
-
'https://www.dropbox.com/scl/fi/se0b1a66rct9ludn5nx8p/2021_archive.zip.003?rlkey=m6e579metkdyg8hmhgouuyxug&st=z0hqvdcw&dl=1'
|
161
|
-
]},
|
162
|
-
{'year': 2022, 'urls': [
|
163
|
-
'https://www.dropbox.com/scl/fi/2iz7url6znpchw55ufduw/2022_archive.zip.001?rlkey=d3b4topzrj6qd2ag9ui8tbxuv&st=id8ybmcg&dl=1',
|
164
|
-
'https://www.dropbox.com/scl/fi/ia6y75uwuap2eo3cljqz6/2022_archive.zip.002?rlkey=hzksfpslqms6khimhz4pwyzuv&st=d05v5oqh&dl=1',
|
165
|
-
'https://www.dropbox.com/scl/fi/q0y77ektba0kkyfd86x9f/2022_archive.zip.003?rlkey=imo5k84n0oq9xzlnd3qi4hsxx&st=38ezt7hx&dl=1'
|
166
|
-
]},
|
167
|
-
{'year': 2023, 'urls': [
|
168
|
-
'https://www.dropbox.com/scl/fi/lsrpoatfkdpk9hhc3noqy/2023_archive.zip.001?rlkey=o76y41tm7fbbd87b3m9papbqg&st=ucq14or6&dl=1',
|
169
|
-
'https://www.dropbox.com/scl/fi/dldxu8a3uzk69fzp33gfi/2023_archive.zip.002?rlkey=gi4sj8ol2x6s7hnk36rp9jh4r&st=wqrhw4rn&dl=1',
|
170
|
-
'https://www.dropbox.com/scl/fi/u51gbwg5moz4qgoyvcb62/2023_archive.zip.003?rlkey=vl3h41up8k049pr8lglbwh8hh&st=extn710g&dl=1'
|
171
|
-
]},
|
172
|
-
{'year': 2024, 'urls': [
|
173
|
-
'https://www.dropbox.com/scl/fi/1exc08zvgz2pgcp3w3riy/2024_archive.zip.001?rlkey=1r6k5r9kcyske314tp1qitfua&st=eo1elt94&dl=1',
|
174
|
-
'https://www.dropbox.com/scl/fi/u9d0e61euy3p1aq7nkmka/2024_archive.zip.002?rlkey=vwijac5pfwbeyxg7lr5m4f3lx&st=aedjdb8u&dl=1'
|
175
|
-
]}
|
176
|
-
]
|
177
|
-
|
178
|
-
dataset_10k_record_list = [{'year':2001,'record':'13871721'},
|
179
|
-
{'year':2002,'record':'13871779'},
|
180
|
-
{'year':2003,'record':'13871791'},
|
181
|
-
{'year':2004,'record':'13871811'},
|
182
|
-
{'year':2005,'record':'13871828'},
|
183
|
-
{'year':2006,'record':'13871965'},
|
184
|
-
{'year':2007,'record':'13872346'},
|
185
|
-
{'year':2008,'record':'13872366'},
|
186
|
-
{'year':2009,'record':'13872372'},
|
187
|
-
{'year':2010,'record':'13872374'},
|
188
|
-
{'year':2011,'record':'13872380'},
|
189
|
-
{'year':2012,'record':'13872400'},
|
190
|
-
{'year':2013,'record':'13872494'},
|
191
|
-
{'year':2014,'record':'13872496'},
|
192
|
-
{'year':2015,'record':'13872511'},
|
193
|
-
{'year':2016,'record':'13872528'},
|
194
|
-
{'year':2017,'record':'13872585'},
|
195
|
-
{'year':2018,'record':'13872601'},
|
196
|
-
{'year':2019,'record':'13872609'},
|
197
|
-
{'year':2020,'record':'13872611'},
|
198
|
-
{'year':2021,'record':'13872655'},
|
199
|
-
{'year':2022,'record':'13872647'},
|
200
|
-
{'year':2023,'record':'13872783'},
|
201
|
-
{'year':2024,'record':'13872663'}
|
202
|
-
]
|
datamule/parser/__init__.py
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
from .sgml_parser import parse_submission
|
@@ -1,82 +0,0 @@
|
|
1
|
-
from pathlib import Path
|
2
|
-
import re
|
3
|
-
from .helper import load_file_content, clean_title
|
4
|
-
|
5
|
-
# A line beginning (after optional whitespace) with "part", one or more
# separator characters, then a Roman (I/V/X) or Arabic part number, which is
# captured. Case-insensitive.
PART_PATTERN = re.compile(r'\n\s*part[.:)?\s]+([IVX]+|\d+)', re.I)
# A line beginning with "item", separators, then an item number with an
# optional single-letter suffix (e.g. "1A"); the identifier is captured.
ITEM_PATTERN = re.compile(r'\n\s*item[.:)?\s]+(\d+[A-Z]?)', re.I)
# Any mention of "item 14"; parse_10k returns {} when the text has none.
IS_10K_PATTERN = re.compile(r'item[.:)?\s]+14', re.I)
# Lazily spans from an "item 14" mention up to (not including) the next line
# that begins with "item 1". find_content_start resumes its search from the
# end of this match -- presumably to skip a table of contents; confirm.
TOC_END_PATTERN = re.compile(r'(?:item[.:)?\s]+14).*?(?=\n\s*item[.:)?\s]+1\b)', re.I | re.DOTALL)

# Roman part numeral -> digit used to build the "partN" output keys.
ROMAN_TO_NUM = {'I': '1', 'II': '2', 'III': '3', 'IV': '4'}

# Item identifier -> Roman numeral of the part extract_sections files it
# under. Items not listed here are not written to the result.
ITEM_TO_PART = {
    '1': 'I', '1A': 'I', '1B': 'I', '1C': 'I', '2': 'I', '3': 'I', '4': 'I',
    '5': 'II', '6': 'II', '7': 'II', '7A': 'II', '8': 'II', '9': 'II', '9A': 'II', '9B': 'II', '9C': 'II',
    '10': 'III', '11': 'III', '12': 'III', '13': 'III', '14': 'III',
    '15': 'IV', '16': 'IV', '16A': 'IV'
}
|
18
|
-
|
19
|
-
def find_content_start(content):
    """Return the offset in *content* where anchor scanning should begin.

    When TOC_END_PATTERN matches, the first line beginning with "item 1" is
    searched for from the end of that match and its start offset is returned.
    If either search finds nothing, 0 is returned.
    """
    toc = TOC_END_PATTERN.search(content)
    if not toc:
        return 0
    first_item = re.compile(r'\n\s*item\s*1\b', re.I).search(content, toc.end())
    return first_item.start() if first_item else 0
|
27
|
-
|
28
|
-
def find_anchors(content):
    """Collect part and item heading anchors, ordered by position.

    Scanning starts at the offset given by find_content_start. The patterns
    run against that slice with a newline prepended, and each match start is
    then shifted by the slice offset.

    Returns:
        A list of (kind, identifier, position, matched_text) tuples where
        kind is 'part' or 'item', sorted by position.
    """
    offset = find_content_start(content)
    body = '\n' + content[offset:]

    found = [
        ('part', m.group(1), m.start() + offset, m.group())
        for m in PART_PATTERN.finditer(body)
    ]
    found.extend(
        ('item', m.group(1), m.start() + offset, m.group())
        for m in ITEM_PATTERN.finditer(body)
    )
    found.sort(key=lambda anchor: anchor[2])
    return found
|
40
|
-
|
41
|
-
def extract_sections(content, anchors, filename):
    """Slice *content* into 10-K item sections grouped by part.

    Args:
        content: Document text that the anchor offsets index into.
        anchors: Position-sorted (kind, identifier, offset, ...) tuples. Only
            'item' anchors produce text; any other anchor just marks where
            the preceding item's text ends.
        filename: Source path; its stem becomes ``metadata["document_name"]``.

    Returns:
        ``{}`` when *anchors* is empty. Otherwise a dict with ``metadata``
        and ``document``, where ``document`` maps ``partN`` to
        ``{"item<id>": text}`` and parts without items are omitted. Items
        whose identifier is not in ITEM_TO_PART are left out.
    """
    if not anchors:
        return {}

    document = {"part1": {}, "part2": {}, "part3": {}, "part4": {}}

    # Consecutive item anchors with the same identifier are merged into one
    # run of text.
    runs = []
    for index, anchor in enumerate(anchors):
        if anchor[0] != 'item':
            continue
        end = anchors[index + 1][2] if index + 1 < len(anchors) else len(content)
        text = content[anchor[2]:end].strip()
        # ITEM_PATTERN is case-insensitive, so it can capture "1a"; the
        # ITEM_TO_PART keys are upper-case, so normalise before comparing.
        item_id = anchor[1].upper()
        if runs and runs[-1][0] == item_id:
            runs[-1][1] += "\n\n" + text
        else:
            runs.append([item_id, text])

    for item_id, text in runs:
        part = ITEM_TO_PART.get(item_id)
        if part is not None:
            # A later run of the same item replaces an earlier one.
            document[f"part{ROMAN_TO_NUM[part]}"][f"item{item_id.lower()}"] = text

    return {
        "metadata": {"document_name": Path(filename).stem},
        # Only keep non-empty parts
        "document": {part: items for part, items in document.items() if items},
    }
|
76
|
-
|
77
|
-
def parse_10k(filename):
    """Load *filename* and split it into 10-K sections.

    Returns ``{}`` when the text has no IS_10K_PATTERN match; otherwise the
    result of extract_sections over the anchors found in the text.
    """
    text = load_file_content(filename)
    if IS_10K_PATTERN.search(text) is None:
        return {}
    return extract_sections(text, find_anchors(text), filename)
|
@@ -1,73 +0,0 @@
|
|
1
|
-
from pathlib import Path
|
2
|
-
from .helper import load_file_content, clean_title
|
3
|
-
import re
|
4
|
-
|
5
|
-
# A line beginning (after optional whitespace) with "part II", an optional
# period, then ':', '.', whitespace or end of input. Case-insensitive.
PART_II_PATTERN = re.compile(r'\n\s*part\s+II\.?(?:[:\s\.]|$)', re.I)
# A line beginning with "item" and an item number with an optional
# single-letter suffix (captured), followed by the same terminators.
ITEM_PATTERN = re.compile(r'\n\s*item\s+(\d+[A-Z]?)\.?(?:[:\s\.]|$)', re.I)
# Lazily spans from an "item 6" mention up to (not including) the next line
# that begins with "item 1". find_content_start resumes its search from the
# end of this match -- presumably to skip a table of contents; confirm.
TOC_END_PATTERN = re.compile(r'(?:item\s*6\.?).*?(?=\n\s*item\s*1\.?\b)', re.I | re.DOTALL)
|
8
|
-
|
9
|
-
def find_content_start(content):
    """Return the offset in *content* where anchor scanning should begin.

    When TOC_END_PATTERN matches, the first line beginning with "item 1" is
    searched for from the end of that match and its start offset is returned.
    If either search finds nothing, 0 is returned.
    """
    toc = TOC_END_PATTERN.search(content)
    if not toc:
        return 0
    first_item = re.compile(r'\n\s*item\s*1\b', re.I).search(content, toc.end())
    return first_item.start() if first_item else 0
|
17
|
-
|
18
|
-
def find_anchors(content):
    """Collect item heading anchors and the position of the Part II heading.

    Scanning starts at the offset given by find_content_start. The patterns
    run against that slice with a newline prepended, and each match start is
    then shifted by the slice offset.

    Returns:
        A pair ``(anchors, part_ii_pos)``: anchors is a position-sorted list
        of ('item', identifier, position, matched_text) tuples, and
        part_ii_pos is the shifted start of the first PART_II_PATTERN match,
        or None when there is none.
    """
    offset = find_content_start(content)
    body = '\n' + content[offset:]

    part_two = PART_II_PATTERN.search(body)
    if part_two:
        part_two_pos = part_two.start() + offset
    else:
        part_two_pos = None

    items = [
        ('item', m.group(1), m.start() + offset, m.group())
        for m in ITEM_PATTERN.finditer(body)
    ]
    items.sort(key=lambda anchor: anchor[2])
    return items, part_two_pos
|
30
|
-
|
31
|
-
def extract_sections(content, anchors_and_part2, filename):
    """Slice *content* into 10-Q item sections split between Part I and II.

    Args:
        content: Document text that the anchor offsets index into.
        anchors_and_part2: Pair ``(anchors, part2_pos)``. *anchors* is a
            position-sorted list of (kind, identifier, offset, ...) tuples;
            *part2_pos* is the offset of the Part II heading, or None.
        filename: Source path; its stem becomes ``metadata["document_name"]``.

    Returns:
        ``{}`` when there are no anchors. Otherwise a dict with ``metadata``
        and ``document``, where ``document`` maps ``part1`` / ``part2`` to
        ``{"item<id>": text}`` and empty parts are omitted.
    """
    anchors, part2_pos = anchors_and_part2
    if not anchors:
        return {}

    document = {"part1": {}, "part2": {}}
    previous = None

    for index, anchor in enumerate(anchors):
        start = anchor[2]
        end = anchors[index + 1][2] if index + 1 < len(anchors) else len(content)
        text = content[start:end].strip()

        # Compare against None explicitly: offset 0 is a valid Part II position.
        in_part2 = part2_pos is not None and start >= part2_pos
        part_key = "part2" if in_part2 else "part1"
        item_key = f"item{anchor[1].lower()}"

        # Item numbers restart in Part II, so a repeated number is only a
        # continuation when it is also in the same part as the previous one.
        if (part_key, item_key) == previous:
            document[part_key][item_key] += "\n\n" + text
        else:
            document[part_key][item_key] = text
            previous = (part_key, item_key)

    return {
        "metadata": {"document_name": Path(filename).stem},
        # Clean empty parts
        "document": {part: items for part, items in document.items() if items},
    }
|
69
|
-
|
70
|
-
def parse_10q(filename):
    """Load *filename* and split it into 10-Q sections.

    Returns the result of extract_sections over the anchors and Part II
    position found in the text.
    """
    text = load_file_content(filename)
    return extract_sections(text, find_anchors(text), filename)
|