datamule 0.380__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. datamule/__init__.py +46 -86
  2. datamule/book.py +16 -0
  3. datamule/config.py +29 -0
  4. datamule/data/company_former_names.csv +8148 -8148
  5. datamule/data/company_metadata.csv +10049 -10049
  6. datamule/data/company_tickers.csv +9999 -10168
  7. datamule/data/sec-glossary.csv +728 -728
  8. datamule/data/xbrl_descriptions.csv +10024 -10024
  9. datamule/document.py +278 -0
  10. datamule/downloader/downloader.py +374 -0
  11. datamule/downloader/premiumdownloader.py +335 -0
  12. datamule/helper.py +123 -136
  13. datamule/mapping_dicts/txt_mapping_dicts.py +232 -0
  14. datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
  15. datamule/monitor.py +238 -0
  16. datamule/mulebot/__init__.py +1 -1
  17. datamule/mulebot/helper.py +34 -34
  18. datamule/mulebot/mulebot.py +129 -129
  19. datamule/mulebot/mulebot_server/server.py +86 -86
  20. datamule/mulebot/mulebot_server/static/css/minimalist.css +173 -173
  21. datamule/mulebot/mulebot_server/static/scripts/artifacts.js +67 -67
  22. datamule/mulebot/mulebot_server/static/scripts/chat.js +91 -91
  23. datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +55 -55
  24. datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +14 -14
  25. datamule/mulebot/mulebot_server/static/scripts/main.js +56 -56
  26. datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +26 -26
  27. datamule/mulebot/mulebot_server/static/scripts/suggestions.js +46 -46
  28. datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +128 -128
  29. datamule/mulebot/mulebot_server/static/scripts/utils.js +27 -27
  30. datamule/mulebot/mulebot_server/templates/chat-minimalist.html +90 -90
  31. datamule/mulebot/search.py +51 -51
  32. datamule/mulebot/tools.py +82 -82
  33. datamule/packageupdater.py +207 -0
  34. datamule/portfolio.py +106 -0
  35. datamule/submission.py +76 -0
  36. datamule-1.0.0.dist-info/METADATA +27 -0
  37. datamule-1.0.0.dist-info/RECORD +40 -0
  38. {datamule-0.380.dist-info → datamule-1.0.0.dist-info}/WHEEL +1 -1
  39. datamule/data/filing_types.csv +0 -485
  40. datamule/data/ftd_locations.csv +0 -388
  41. datamule/datamule_api.py +0 -21
  42. datamule/dataset_builder/_init.py +0 -1
  43. datamule/dataset_builder/dataset_builder.py +0 -260
  44. datamule/downloader/__init__.py +0 -0
  45. datamule/downloader/dropbox_downloader.py +0 -225
  46. datamule/downloader/ftd.py +0 -216
  47. datamule/downloader/information_table_13f.py +0 -231
  48. datamule/downloader/sec_downloader.py +0 -635
  49. datamule/filing_viewer/__init__.py +0 -1
  50. datamule/filing_viewer/filing_viewer.py +0 -256
  51. datamule/global_vars.py +0 -202
  52. datamule/parser/__init__.py +0 -1
  53. datamule/parser/basic_10k_parser.py +0 -82
  54. datamule/parser/basic_10q_parser.py +0 -73
  55. datamule/parser/basic_13d_parser.py +0 -58
  56. datamule/parser/basic_13g_parser.py +0 -61
  57. datamule/parser/basic_8k_parser.py +0 -84
  58. datamule/parser/company_concepts_parser.py +0 -0
  59. datamule/parser/form_d_parser.py +0 -70
  60. datamule/parser/generalized_item_parser.py +0 -78
  61. datamule/parser/generalized_xml_parser.py +0 -0
  62. datamule/parser/helper.py +0 -75
  63. datamule/parser/information_table_parser_13fhr.py +0 -41
  64. datamule/parser/insider_trading_parser.py +0 -158
  65. datamule/parser/mappings.py +0 -95
  66. datamule/parser/n_port_p_parser.py +0 -70
  67. datamule/parser/sec_parser.py +0 -79
  68. datamule/parser/sgml_parser.py +0 -180
  69. datamule/sec_filing.py +0 -126
  70. datamule/sec_search.py +0 -20
  71. datamule-0.380.dist-info/METADATA +0 -110
  72. datamule-0.380.dist-info/RECORD +0 -61
  73. {datamule-0.380.dist-info → datamule-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,256 +0,0 @@
1
-
2
- # AI generated slop. Quick workaround to get mulebot server artifact to build correctly. Will rewrite later.
3
- import re
4
-
5
def create_valid_id(title):
    """Turn a section title into a string usable as an HTML ``id``.

    Args:
        title: Section title text.

    Returns:
        The lower-cased title with every run of characters other than word
        characters, ``-``, ``.`` and ``:`` collapsed to a single ``-``.
        ``'section-'`` is prepended when the result does not start with a
        letter, which includes the empty-title case.
    """
    # Collapse each run of disallowed characters into one hyphen.
    valid_id = re.sub(r'[^\w\-.:]+', '-', title)
    # Slice instead of indexing: valid_id[0] raised IndexError for an empty
    # title, whereas ''[:1].isalpha() is simply False.
    if not valid_id[:1].isalpha():
        valid_id = 'section-' + valid_id
    return valid_id.lower()
13
-
14
def create_content(content, level=1):
    """Render a list of section dicts as nested HTML ``<div>`` sections.

    Each item may carry any of the keys ``'title'``, ``'text'`` and
    ``'content'`` (a list of child items rendered recursively one level
    deeper).

    Args:
        content: Iterable of section dicts.
        level: Nesting depth, used in the ``level-N`` CSS class and in the
            fallback id of untitled sections.

    Returns:
        The concatenated HTML for all items; ``''`` for an empty iterable.
    """
    # Function-scope import so the block does not depend on a new file-level
    # import.
    from html import escape

    pieces = []
    for index, item in enumerate(content):
        titled = 'title' in item
        if titled:
            section_id = create_valid_id(item['title'])
        else:
            section_id = f'section-{level}-{index}'

        pieces.append(f'<div class="section level-{level}" id="{section_id}">')

        # Title and text come from filing documents; escape them so that
        # characters such as "&" and "<" cannot break or inject markup.
        # quote=False: these are text nodes, quotes are harmless there.
        if titled:
            heading = escape(str(item['title']), quote=False)
            pieces.append(f'<h3 class="section-title">{heading}</h3>')
        if 'text' in item:
            paragraph = escape(str(item['text']), quote=False)
            pieces.append(f'<p class="section-text">{paragraph}</p>')

        if 'content' in item:
            pieces.append('<div class="sub-content">')
            pieces.append(create_content(item['content'], level + 1))
            pieces.append('</div>')

        pieces.append('</div>')

    return ''.join(pieces)
37
-
38
def json_to_html(data):
    """Build the dashboard markup (sidebar, header, sections) for a filing.

    Args:
        data: Mapping with the keys ``'cik'``, ``'accession_number'`` and
            ``'document'``. ``'document'`` is iterated; every entry that has
            a ``'content'`` key is rendered with ``create_content``.

    Returns:
        An HTML fragment wrapped in ``<div class="dashboard-container">``.
    """
    # Empty list; the page script of create_interactive_filing fills it.
    sidebar = '''
    <div class="sidebar" id="sidebar">
        <h2 class="sidebar-title">Sections</h2>
        <ul id="section-list" class="nav flex-column"></ul>
    </div>
    '''

    cik = data['cik']
    accession_number = data['accession_number']
    header = f'''
    <header class="dashboard-header">
        <h1 class="dashboard-title">Filing Viewer</h1>
        <p class="dashboard-subtitle">CIK: {cik} | Accession Number: {accession_number}</p>
    </header>
    '''

    # Entries without a 'content' key contribute nothing.
    sections = [
        create_content(doc['content'])
        for doc in data['document']
        if 'content' in doc
    ]

    return ''.join([
        '<div class="dashboard-container">',
        sidebar,
        '<div class="main-content">',
        header,
        *sections,
        '</div></div>',
    ])
68
-
69
def create_interactive_filing(json_data):
    """Return a complete HTML page that displays a parsed filing.

    The page consists of a Bootstrap stylesheet link, an inline stylesheet,
    the markup produced by ``json_to_html(json_data)`` and an inline script
    that fills the sidebar with links to every titled section, toggles the
    sidebar on small screens and highlights the link of the section in view.

    Args:
        json_data: Filing data in the shape accepted by ``json_to_html``.

    Returns:
        The HTML document as a string.
    """
    # Plain (non f-string) literals: CSS/JS braces need no doubling here.
    stylesheet = '''
        :root {
            --primary-color: #333;
            --secondary-color: #666;
            --background-color: #f8f8f8;
            --text-color: #333;
            --border-color: #e0e0e0;
        }
        body {
            font-family: 'Arial', sans-serif;
            background-color: var(--background-color);
            color: var(--text-color);
            line-height: 1.6;
        }
        .dashboard-container {
            display: flex;
            min-height: 100vh;
        }
        .sidebar {
            width: 250px;
            background-color: white;
            border-right: 1px solid var(--border-color);
            padding: 20px;
            position: fixed;
            height: 100vh;
            overflow-y: auto;
            transition: transform 0.3s ease-in-out;
        }
        .main-content {
            flex-grow: 1;
            margin-left: 250px;
            padding: 40px;
        }
        .dashboard-header {
            margin-bottom: 40px;
        }
        .dashboard-title {
            color: var(--primary-color);
            font-weight: bold;
            font-size: 2.5rem;
            margin-bottom: 10px;
        }
        .dashboard-subtitle {
            color: var(--secondary-color);
            font-size: 1rem;
        }
        .section {
            margin-bottom: 30px;
            padding-left: 20px;
            border-left: 2px solid var(--border-color);
        }
        .section-title {
            color: var(--primary-color);
            font-size: 1.5rem;
            margin-bottom: 15px;
        }
        .level-2 { margin-left: 20px; }
        .level-3 { margin-left: 40px; }
        .nav-link {
            color: var(--text-color);
            transition: all 0.3s ease;
            padding: 5px 10px;
            margin-bottom: 5px;
            border-radius: 4px;
        }
        .nav-link:hover, .nav-link.active {
            background-color: var(--background-color);
            color: var(--primary-color);
        }
        .sidebar-title {
            font-size: 1.2rem;
            color: var(--primary-color);
            margin-bottom: 20px;
            padding-bottom: 10px;
            border-bottom: 1px solid var(--border-color);
        }
        .toggle-sidebar {
            display: none;
            position: fixed;
            top: 10px;
            left: 10px;
            z-index: 1000;
            background-color: var(--primary-color);
            color: white;
            border: none;
            padding: 10px;
            border-radius: 5px;
        }
        @media (max-width: 768px) {
            .sidebar {
                transform: translateX(-100%);
                z-index: 1000;
            }
            .sidebar.active {
                transform: translateX(0);
            }
            .main-content {
                margin-left: 0;
                padding: 20px;
            }
            .toggle-sidebar {
                display: block;
            }
            .dashboard-title {
                font-size: 2rem;
            }
            .section {
                padding-left: 10px;
            }
            .level-2, .level-3 {
                margin-left: 10px;
            }
        }
    '''

    page_script = '''
        document.addEventListener('DOMContentLoaded', (event) => {
            const sidebar = document.getElementById('sidebar');
            const toggleSidebar = document.getElementById('toggleSidebar');
            const sections = document.querySelectorAll('.section');
            const sectionList = document.getElementById('section-list');

            toggleSidebar.addEventListener('click', () => {
                sidebar.classList.toggle('active');
            });

            sections.forEach((section, index) => {
                const title = section.querySelector('.section-title');
                if (title) {
                    const listItem = document.createElement('li');
                    const link = document.createElement('a');
                    link.href = `#${section.id}`;
                    link.className = 'nav-link';
                    link.textContent = title.textContent;
                    listItem.appendChild(link);
                    sectionList.appendChild(listItem);

                    link.addEventListener('click', (e) => {
                        e.preventDefault();
                        section.scrollIntoView({behavior: 'smooth'});
                        if (window.innerWidth <= 768) {
                            sidebar.classList.remove('active');
                        }
                    });
                }
            });

            const observerOptions = {
                root: null,
                rootMargin: '0px',
                threshold: 0.5
            };

            const observer = new IntersectionObserver((entries) => {
                entries.forEach(entry => {
                    if (entry.isIntersecting) {
                        const id = entry.target.id;
                        document.querySelectorAll('.nav-link').forEach(navLink => {
                            navLink.classList.remove('active');
                            if (navLink.getAttribute('href') === `#${id}`) {
                                navLink.classList.add('active');
                            }
                        });
                    }
                });
            }, observerOptions);

            sections.forEach(section => {
                observer.observe(section);
            });
        });
    '''

    dashboard = json_to_html(json_data)

    return f'''
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <title>SEC 10-K Premium Executive Dashboard</title>
        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/bootstrap/5.3.0/css/bootstrap.min.css">
        <style>{stylesheet}</style>
    </head>
    <body>
        <button class="toggle-sidebar" id="toggleSidebar">☰</button>
        {dashboard}
        <script>{page_script}</script>
    </body>
    </html>
    '''
datamule/global_vars.py DELETED
@@ -1,202 +0,0 @@
1
- headers = {
2
- "User-Agent": "Peter Smith petersmith@gmail.com" # Replace with your information
3
- }
4
-
5
- dataset_10k_url_list = [
6
- {'year': 2024, 'urls': [
7
- 'https://www.dropbox.com/scl/fi/3gd9whn8qtychbxuxnbsa/2024_archive.zip.001?rlkey=2n8qwhcccevniqkvy39ksa467&st=hn3kacs6&dl=1',
8
- 'https://www.dropbox.com/scl/fi/8citjlh4h58speyag3hd9/2024_archive.zip.002?rlkey=ymadt6wc81e9m3a15znwum7s1&st=opzcpxye&dl=1'
9
- ]},
10
- {'year': 2023, 'urls': [
11
- 'https://www.dropbox.com/scl/fi/hdnb6bbr7l3xgrfmc73ht/2023_archive.zip.001?rlkey=kd0npzwvscacfdz0syq2irnu7&st=nzmh3lwr&dl=1',
12
- 'https://www.dropbox.com/scl/fi/ubiyq3tssa95enbb8xi9u/2023_archive.zip.002?rlkey=xkef3tx3q5a4f3oh38tx4cjy4&st=z3nrs8g3&dl=1'
13
- ]},
14
- {'year': 2022, 'urls': [
15
- 'https://www.dropbox.com/scl/fi/rlhvogepk9cpnohhq4gs7/2022_archive.zip.001?rlkey=81hmjgdt1rtjub64wrlp9oy5t&st=i6ecnbux&dl=1',
16
- 'https://www.dropbox.com/scl/fi/r5m6y1j8uf02uy61u3fcn/2022_archive.zip.002?rlkey=z80qlgjifbtf5mjuqlu98478p&st=7wqvhekh&dl=1'
17
- ]},
18
- {'year': 2021, 'urls': [
19
- 'https://www.dropbox.com/scl/fi/wemvdqxsqddlhlcgon36g/2021_archive.zip.001?rlkey=tjl3525vn60zwosnqdgznecj5&st=66bycsgf&dl=1',
20
- 'https://www.dropbox.com/scl/fi/si0nynzxxf31kxpxobzrf/2021_archive.zip.002?rlkey=93oczu6hs5iusex2f65k2mxc7&st=x8cymp6w&dl=1'
21
- ]},
22
- {'year': 2020, 'urls': [
23
- 'https://www.dropbox.com/scl/fi/vxvgwrw2q04qlj5m2aoog/2020_archive.zip.001?rlkey=88h3x78axn5ghvk9t5otqpdjd&st=72xwi1y1&dl=1',
24
- 'https://www.dropbox.com/scl/fi/9blysoqztxg5vedrf2l2i/2020_archive.zip.002?rlkey=msvos1omcb8fowb4q1nm38m6e&st=bscfunry&dl=1'
25
- ]},
26
- {'year': 2019, 'urls': [
27
- 'https://www.dropbox.com/scl/fi/hq5o9zo8xrqmd7l4o06hy/2019_archive.zip.001?rlkey=sazeziru87k7qptqhxenv0d6m&st=241jmwwd&dl=1',
28
- 'https://www.dropbox.com/scl/fi/2jyxw65unxhhsk5fuhuon/2019_archive.zip.002?rlkey=nzyf1em08qgxdhpz2vuoj417u&st=ii9zpdxi&dl=1'
29
- ]},
30
- {'year': 2018, 'urls': [
31
- 'https://www.dropbox.com/scl/fi/c1vexzflxr6qcsg25nxp7/2018_archive.zip.001?rlkey=hnb5zeashbtqfhxsnf9vt94vv&st=wy9i633f&dl=1',
32
- 'https://www.dropbox.com/scl/fi/yzt3464lscpmy5n39olk5/2018_archive.zip.002?rlkey=tu3lbnjnd1xwni8f6nfpbmtgm&st=c0zur5sz&dl=1'
33
- ]},
34
- {'year': 2017, 'urls': [
35
- 'https://www.dropbox.com/scl/fi/3trjwjx6v64ilnt8nyp02/2017_archive.zip.001?rlkey=vl4x1rrp0fisjy3djrraayjoe&st=ept0d24k&dl=1',
36
- 'https://www.dropbox.com/scl/fi/p011jrntmkrmlb9u84k62/2017_archive.zip.002?rlkey=55uka4y2d90eb5d8lgu86yl6c&st=ildtcc94&dl=1'
37
- ]},
38
- {'year': 2016, 'urls': [
39
- 'https://www.dropbox.com/scl/fi/5oydfbume2mxqfobn2e9r/2016_archive.zip.001?rlkey=4h76gl9ny8e7vgcdnphf7bzn9&st=jkr0ioby&dl=1',
40
- 'https://www.dropbox.com/scl/fi/faofea4f2mkzjslt12s0b/2016_archive.zip.002?rlkey=bolnuqm3fq7yrfqhf5ek92dgp&st=33w8ivrx&dl=1'
41
- ]},
42
- {'year': 2015, 'urls': [
43
- 'https://www.dropbox.com/scl/fi/75rdrrsrgbg95qcedcr65/2015_archive.zip.001?rlkey=pb4ec6sda3ii0lnzua4enxnr3&st=t7wkjb60&dl=1',
44
- 'https://www.dropbox.com/scl/fi/ixfttx508tp8cuf3xismr/2015_archive.zip.002?rlkey=xcoqtcx3vjnh3ctxhpqe4jv2j&st=56fgbb8w&dl=1'
45
- ]},
46
- {'year': 2014, 'urls': [
47
- 'https://www.dropbox.com/scl/fi/1y1j6ct6mox76euu38t2c/2014_archive.zip.001?rlkey=hwh83ttl3nahb1oegib05p3k7&st=d01umhdp&dl=1',
48
- 'https://www.dropbox.com/scl/fi/bh2yu3coqcshj5mybk3wd/2014_archive.zip.002?rlkey=0g4ftzhytyn3vk8kgwu72b6lf&st=jz9pzdoy&dl=1'
49
- ]},
50
- {'year': 2013, 'urls': [
51
- 'https://www.dropbox.com/scl/fi/jraed38u18c9y16mwcnmo/2013_archive.zip.001?rlkey=fvy6flk8uxk2mn5wjvynu96ag&st=3sivwbx7&dl=1',
52
- 'https://www.dropbox.com/scl/fi/cgi8opfbnu727seazzmvd/2013_archive.zip.002?rlkey=sm7h7wfzud22u3ed1pw8fr7u9&st=19tunve8&dl=1'
53
- ]},
54
- {'year': 2012, 'urls': [
55
- 'https://www.dropbox.com/scl/fi/hji2bb1ce2wdwf5yc6dyf/2012_archive.zip.001?rlkey=0r53m8roo6e8grqez3lnhpayk&st=1jx5jq4r&dl=1',
56
- 'https://www.dropbox.com/scl/fi/hqoh4l305b168619eytkj/2012_archive.zip.002?rlkey=2laeldqzlwskwoha9idmioolf&st=1w8zowyp&dl=1'
57
- ]},
58
- {'year': 2011, 'urls': [
59
- 'https://www.dropbox.com/scl/fi/z7z8qnmf73hqr33b386zu/2011_archive.zip.001?rlkey=kdkd3urxmo830n30gwiapqvkz&st=2hsuxpcm&dl=1',
60
- 'https://www.dropbox.com/scl/fi/illd2qfsj2vuy4yjd13el/2011_archive.zip.002?rlkey=oewcg57c92wlbufwhon21mjeq&st=ir05xure&dl=1'
61
- ]},
62
- {'year': 2010, 'urls': [
63
- 'https://www.dropbox.com/scl/fi/j41ta06g0fso473x4oa1f/2010_archive.zip.001?rlkey=1r83ibenn06fxs6zhm6oi46pr&st=iia9qtid&dl=1',
64
- 'https://www.dropbox.com/scl/fi/31b6huoywrrc44b76wm1w/2010_archive.zip.002?rlkey=40jfl7zqnw5sikgd4wuo1095m&st=igy563mu&dl=1'
65
- ]},
66
- {'year': 2009, 'urls': [
67
- 'https://www.dropbox.com/scl/fi/4y6c1icwvkjwwqbgx4w1a/2009_archive.zip.001?rlkey=3qqp4ikinplktw6g39x68rdmj&st=dcg0eik1&dl=1',
68
- 'https://www.dropbox.com/scl/fi/jq808ah0j0vg1sqdmvnnm/2009_archive.zip.002?rlkey=wbgtvj8fkpgmcj5oxpj88jfog&st=4txza2fu&dl=1'
69
- ]},
70
- {'year': 2008, 'urls': [
71
- 'https://www.dropbox.com/scl/fi/uf1ym44ns1936uj8vqfwk/2008_archive.zip.001?rlkey=9rivaprk7yjrutdfu7sqo3jxv&st=gj75o1ne&dl=1'
72
- ]},
73
- {'year': 2007, 'urls': [
74
- 'https://www.dropbox.com/scl/fi/58qmwj7m3rrl7kr00lwat/2007_archive.zip.001?rlkey=815zw0gnb7gowcdf0iuvtctqh&st=7cajp3ii&dl=1'
75
- ]},
76
- {'year': 2006, 'urls': [
77
- 'https://www.dropbox.com/scl/fi/gxi8qzpz53f9qcvn2hpl5/2006_archive.zip.001?rlkey=hmehz1azpbxzpw6j1wy5eppq0&st=czxrdcnh&dl=1'
78
- ]},
79
- {'year': 2005, 'urls': [
80
- 'https://www.dropbox.com/scl/fi/tjstefvwfzs3p0a1vzlbg/2005_archive.zip.001?rlkey=825m16ziekd9mwc3ybvjvisj0&st=4dyy121i&dl=1'
81
- ]},
82
- {'year': 2004, 'urls': [
83
- 'https://www.dropbox.com/scl/fi/2g25emvme8gqoxnv5fhla/2004_archive.zip.001?rlkey=lz8oyniqgc7xvn343d39600ic&st=yvmi9h1t&dl=1'
84
- ]},
85
- {'year': 2003, 'urls': [
86
- 'https://www.dropbox.com/scl/fi/d2ub0o4sqo0b2evd9s9z3/2003_archive.zip.001?rlkey=qxsltqjfxnk0xrp0qx5c49v57&st=cc4dzjo2&dl=1'
87
- ]},
88
- {'year': 2002, 'urls': [
89
- 'https://www.dropbox.com/scl/fi/9xy6y09y2b5zp4w4c7cty/2002_archive.zip.001?rlkey=c4qqact06zz7ykmfc1n5odf2a&st=d93mptu8&dl=1'
90
- ]},
91
- {'year': 2001, 'urls': [
92
- 'https://www.dropbox.com/scl/fi/b3miae1kvths87e0cq8fs/2001_archive.zip.001?rlkey=g21mnbzicju3czney275bpjyp&st=0nji6q8l&dl=1'
93
- ]}
94
- ]
95
-
96
- dataset_10q_url_list = [
97
- {'year': 2001, 'urls': ['https://www.dropbox.com/scl/fi/1bzig8wabbtezfg6dipou/2001_archive.zip.001?rlkey=4sddwex8k9kd4jahypsxgvbs2&st=s5x7wnh5&dl=1']},
98
- {'year': 2002, 'urls': ['https://www.dropbox.com/scl/fi/qe0om30w0dhtg3byse8n1/2002_archive.zip.001?rlkey=7ji21x7ppavstwoe1dz7028r5&st=hb5dkzo2&dl=1']},
99
- {'year': 2003, 'urls': ['https://www.dropbox.com/scl/fi/9dfpof4es1kfdttpejkb1/2003_archive.zip.001?rlkey=36xwaacvtb3rw8mqkd1dwgav2&st=vfmo1tph&dl=1']},
100
- {'year': 2004, 'urls': ['https://www.dropbox.com/scl/fi/l2tv1ywmltlx2ygbmy1k4/2004_archive.zip.001?rlkey=sdrczfb9irv9q2xi10a2y169j&st=6a1v0pos&dl=1']},
101
- {'year': 2005, 'urls': ['https://www.dropbox.com/scl/fi/16euq6ies55c0q4z2ws0q/2005_archive.zip.001?rlkey=l80ig2irksajd7djmlv6bith0&st=yc5ing9t&dl=1']},
102
- {'year': 2006, 'urls': ['https://www.dropbox.com/scl/fi/wi66433i8xdh3g6ozozod/2006_archive.zip.001?rlkey=zg09b09mdg77ni8zsq7p8dex9&st=347utf3x&dl=1']},
103
- {'year': 2007, 'urls': ['https://www.dropbox.com/scl/fi/u3bazimzkkps8qfvaubxm/2007_archive.zip.001?rlkey=fuj28imnb2bjskx2wggoyuvuu&st=76jka6tc&dl=1']},
104
- {'year': 2008, 'urls': [
105
- 'https://www.dropbox.com/scl/fi/htc6j3c9l17ey6urjzm63/2008_archive.zip.001?rlkey=9pnl5066d33x6wan8uqhvom5q&st=fsgogqya&dl=1',
106
- 'https://www.dropbox.com/scl/fi/47jb2sipfg13b5p6dzegb/2008_archive.zip.002?rlkey=ml6zfxrptg1jgebpd6bdmztej&st=3unkdpnr&dl=1'
107
- ]},
108
- {'year': 2009, 'urls': [
109
- 'https://www.dropbox.com/scl/fi/krrc6zx5cvbyhhskrrdlp/2009_archive.zip.001?rlkey=nggwp1z5ekrvgnxlg434vtfqg&st=mn6x1fqp&dl=1',
110
- 'https://www.dropbox.com/scl/fi/w46430sm52bd1bioc94f7/2009_archive.zip.002?rlkey=06ilznlorppqmhpj17wax1id9&st=ozloyl3v&dl=1'
111
- ]},
112
- {'year': 2010, 'urls': [
113
- 'https://www.dropbox.com/scl/fi/g86fzg6dysnt34raq352k/2010_archive.zip.001?rlkey=2kil6s78cj6p5bk8r0eptxygf&st=vyauu3sl&dl=1',
114
- 'https://www.dropbox.com/scl/fi/46ttnl8pb1qfk5icd7n4q/2010_archive.zip.002?rlkey=9nok0mg3mjexybywq1og6vdux&st=oia2g4fy&dl=1'
115
- ]},
116
- {'year': 2011, 'urls': [
117
- 'https://www.dropbox.com/scl/fi/ems2oygr0u3voq38yisuk/2011_archive.zip.001?rlkey=4lfsowv9o6wmkmozn3pdr80sh&st=3kn4ghbe&dl=1',
118
- 'https://www.dropbox.com/scl/fi/ihersfxuqnnli1fhze9wc/2011_archive.zip.002?rlkey=iirqvy919yv3pkvem2owsdgxy&st=0eb8rtbq&dl=1'
119
- ]},
120
- {'year': 2012, 'urls': [
121
- 'https://www.dropbox.com/scl/fi/dima81xb776o6r9rmvxf6/2012_archive.zip.001?rlkey=wlyma7xg70hllk0wutx4boqif&st=301p8dq2&dl=1',
122
- 'https://www.dropbox.com/scl/fi/b7h7a3b83c7pkx1ayz5tx/2012_archive.zip.002?rlkey=380e4viezrorkbdgs16j9qyig&st=35f8jjt9&dl=1'
123
- ]},
124
- {'year': 2013, 'urls': [
125
- 'https://www.dropbox.com/scl/fi/5z0rubg54kgt60sp3w8ir/2013_archive.zip.001?rlkey=9b1ff6vw6v76g9p6n20z0pf1y&st=p2kouaw2&dl=1',
126
- 'https://www.dropbox.com/scl/fi/g0n2vtrc3nsjou1t7zdv8/2013_archive.zip.002?rlkey=42id27sv2tzz4nt2lb999kjo0&st=teww7pk6&dl=1'
127
- ]},
128
- {'year': 2014, 'urls': [
129
- 'https://www.dropbox.com/scl/fi/25kr0m6nfz1uvecpzsl3g/2014_archive.zip.001?rlkey=9b4v6eevhrsqx4yxr4syl3xx9&st=crdzt5e5&dl=1',
130
- 'https://www.dropbox.com/scl/fi/k0dt79eyjuvhxvrdepsat/2014_archive.zip.002?rlkey=u44c7wysi21tpvo7p2emspr96&st=gdzb6vbl&dl=1'
131
- ]},
132
- {'year': 2015, 'urls': [
133
- 'https://www.dropbox.com/scl/fi/jw3a4ua6qgy439jm5guwb/2015_archive.zip.001?rlkey=vsah3muoz6po9iwgfmy6idax4&st=8np4xe5t&dl=1',
134
- 'https://www.dropbox.com/scl/fi/kmk5p3ynpf4e4n1zu4ead/2015_archive.zip.002?rlkey=v5z3sli6unlqomdlgq2vsfmyy&st=684ulwyp&dl=1'
135
- ]},
136
- {'year': 2016, 'urls': [
137
- 'https://www.dropbox.com/scl/fi/veo77wy3muzg7jua1pnon/2016_archive.zip.001?rlkey=xlh62swhnywcruck89ix7zsnv&st=p1u5mrql&dl=1',
138
- 'https://www.dropbox.com/scl/fi/nf4ue014vnf8i5wd3ifq8/2016_archive.zip.002?rlkey=kpnh9hmw7bonbjj3a1qtmx3wr&st=2o1ljgk3&dl=1'
139
- ]},
140
- {'year': 2017, 'urls': [
141
- 'https://www.dropbox.com/scl/fi/ma6kdn0zmr0jsfjuwyrr8/2017_archive.zip.001?rlkey=cmcrs84513amzd0xtnhgowjig&st=2y20plzl&dl=1',
142
- 'https://www.dropbox.com/scl/fi/7pqfkoalf6kwdxglkd4rd/2017_archive.zip.002?rlkey=pu9gpwj8s58jpxaa5bo4qdt2t&st=7cjfuewb&dl=1'
143
- ]},
144
- {'year': 2018, 'urls': [
145
- 'https://www.dropbox.com/scl/fi/76smlo78ilea1h1x5ej5p/2018_archive.zip.001?rlkey=9s7ccdm0il6nash54x7lpzlyq&st=nugdjlct&dl=1',
146
- 'https://www.dropbox.com/scl/fi/ewdm0f8bztpq9290c0bzk/2018_archive.zip.002?rlkey=6baqb8j9ptu17f3r6xvlceuot&st=faj7cbyf&dl=1'
147
- ]},
148
- {'year': 2019, 'urls': [
149
- 'https://www.dropbox.com/scl/fi/9uk4a45vvpda567sonboo/2019_archive.zip.001?rlkey=v0me7vf0lamwue2g936sdduo8&st=30ehpju3&dl=1',
150
- 'https://www.dropbox.com/scl/fi/7uzovuhycbi8gt2fb84jk/2019_archive.zip.002?rlkey=vckqm3ekb7xcmd0m8whfzvmsv&st=yyxsxzhc&dl=1'
151
- ]},
152
- {'year': 2020, 'urls': [
153
- 'https://www.dropbox.com/scl/fi/85aiiz3kun6r8zetjgjgw/2020_archive.zip.001?rlkey=3z55z1kvkgd7vjlit69v3peu4&st=6bqx7i9f&dl=1',
154
- 'https://www.dropbox.com/scl/fi/gc5lt1cocx4fukcx5wmpi/2020_archive.zip.002?rlkey=kpwpswwy5za0d7xspgqu3yq1r&st=8do0y1so&dl=1',
155
- 'https://www.dropbox.com/scl/fi/1zkkim7118qqhy2ktordl/2020_archive.zip.003?rlkey=jryn61lym4x5vf6z7t9uqidt7&st=mpl7uu8e&dl=1'
156
- ]},
157
- {'year': 2021, 'urls': [
158
- 'https://www.dropbox.com/scl/fi/kraiuj98f1at7pepcfdbl/2021_archive.zip.001?rlkey=7x1ppre2o05cdypmsq1quv9so&st=rqqq3skc&dl=1',
159
- 'https://www.dropbox.com/scl/fi/s45tc1e97384ov73zcrrm/2021_archive.zip.002?rlkey=t7c6was2nt5v73bjmyyknma4g&st=ts1esu9j&dl=1',
160
- 'https://www.dropbox.com/scl/fi/se0b1a66rct9ludn5nx8p/2021_archive.zip.003?rlkey=m6e579metkdyg8hmhgouuyxug&st=z0hqvdcw&dl=1'
161
- ]},
162
- {'year': 2022, 'urls': [
163
- 'https://www.dropbox.com/scl/fi/2iz7url6znpchw55ufduw/2022_archive.zip.001?rlkey=d3b4topzrj6qd2ag9ui8tbxuv&st=id8ybmcg&dl=1',
164
- 'https://www.dropbox.com/scl/fi/ia6y75uwuap2eo3cljqz6/2022_archive.zip.002?rlkey=hzksfpslqms6khimhz4pwyzuv&st=d05v5oqh&dl=1',
165
- 'https://www.dropbox.com/scl/fi/q0y77ektba0kkyfd86x9f/2022_archive.zip.003?rlkey=imo5k84n0oq9xzlnd3qi4hsxx&st=38ezt7hx&dl=1'
166
- ]},
167
- {'year': 2023, 'urls': [
168
- 'https://www.dropbox.com/scl/fi/lsrpoatfkdpk9hhc3noqy/2023_archive.zip.001?rlkey=o76y41tm7fbbd87b3m9papbqg&st=ucq14or6&dl=1',
169
- 'https://www.dropbox.com/scl/fi/dldxu8a3uzk69fzp33gfi/2023_archive.zip.002?rlkey=gi4sj8ol2x6s7hnk36rp9jh4r&st=wqrhw4rn&dl=1',
170
- 'https://www.dropbox.com/scl/fi/u51gbwg5moz4qgoyvcb62/2023_archive.zip.003?rlkey=vl3h41up8k049pr8lglbwh8hh&st=extn710g&dl=1'
171
- ]},
172
- {'year': 2024, 'urls': [
173
- 'https://www.dropbox.com/scl/fi/1exc08zvgz2pgcp3w3riy/2024_archive.zip.001?rlkey=1r6k5r9kcyske314tp1qitfua&st=eo1elt94&dl=1',
174
- 'https://www.dropbox.com/scl/fi/u9d0e61euy3p1aq7nkmka/2024_archive.zip.002?rlkey=vwijac5pfwbeyxg7lr5m4f3lx&st=aedjdb8u&dl=1'
175
- ]}
176
- ]
177
-
178
- dataset_10k_record_list = [{'year':2001,'record':'13871721'},
179
- {'year':2002,'record':'13871779'},
180
- {'year':2003,'record':'13871791'},
181
- {'year':2004,'record':'13871811'},
182
- {'year':2005,'record':'13871828'},
183
- {'year':2006,'record':'13871965'},
184
- {'year':2007,'record':'13872346'},
185
- {'year':2008,'record':'13872366'},
186
- {'year':2009,'record':'13872372'},
187
- {'year':2010,'record':'13872374'},
188
- {'year':2011,'record':'13872380'},
189
- {'year':2012,'record':'13872400'},
190
- {'year':2013,'record':'13872494'},
191
- {'year':2014,'record':'13872496'},
192
- {'year':2015,'record':'13872511'},
193
- {'year':2016,'record':'13872528'},
194
- {'year':2017,'record':'13872585'},
195
- {'year':2018,'record':'13872601'},
196
- {'year':2019,'record':'13872609'},
197
- {'year':2020,'record':'13872611'},
198
- {'year':2021,'record':'13872655'},
199
- {'year':2022,'record':'13872647'},
200
- {'year':2023,'record':'13872783'},
201
- {'year':2024,'record':'13872663'}
202
- ]
@@ -1 +0,0 @@
1
- from .sgml_parser import parse_submission
@@ -1,82 +0,0 @@
1
- from pathlib import Path
2
- import re
3
- from .helper import load_file_content, clean_title
4
-
5
# Heading patterns. All are case-insensitive; PART_PATTERN and ITEM_PATTERN
# only match a heading that follows a newline (plus optional whitespace).
PART_PATTERN = re.compile(r'\n\s*part[.:)?\s]+([IVX]+|\d+)', re.I)
ITEM_PATTERN = re.compile(r'\n\s*item[.:)?\s]+(\d+[A-Z]?)', re.I)
# parse_10k returns {} for any document in which this never matches.
IS_10K_PATTERN = re.compile(r'item[.:)?\s]+14', re.I)
# Spans from an "item 14" mention up to (not including) the next "item 1"
# heading; find_content_start uses the end of this span as the search start.
TOC_END_PATTERN = re.compile(r'(?:item[.:)?\s]+14).*?(?=\n\s*item[.:)?\s]+1\b)', re.I | re.DOTALL)
# Compiled once here instead of on every find_content_start call.
_ITEM_1_PATTERN = re.compile(r'\n\s*item\s*1\b', re.I)

ROMAN_TO_NUM = {'I': '1', 'II': '2', 'III': '3', 'IV': '4'}

# Item number (upper-case) -> roman numeral of the part it is filed under.
ITEM_TO_PART = {
    '1': 'I', '1A': 'I', '1B': 'I', '1C': 'I', '2': 'I', '3': 'I', '4': 'I',
    '5': 'II', '6': 'II', '7': 'II', '7A': 'II', '8': 'II', '9': 'II', '9A': 'II', '9B': 'II', '9C': 'II',
    '10': 'III', '11': 'III', '12': 'III', '13': 'III', '14': 'III',
    '15': 'IV', '16': 'IV', '16A': 'IV'
}


def find_content_start(content):
    """Return the offset at which item scanning should begin.

    When TOC_END_PATTERN matches, the offset of the first "item 1" heading
    found after that match is returned; otherwise 0.
    """
    toc_match = TOC_END_PATTERN.search(content)
    if toc_match is None:
        return 0
    item_1_match = _ITEM_1_PATTERN.search(content, toc_match.end())
    if item_1_match is None:
        return 0
    return item_1_match.start()


def find_anchors(content):
    """Locate every part and item heading from the content start onwards.

    Returns:
        A list of ``(kind, label, position, matched_text)`` tuples sorted by
        position, where ``kind`` is ``'part'`` or ``'item'`` and ``label`` is
        the captured number exactly as written in the document.
    """
    start_pos = find_content_start(content)
    # Both patterns require a leading newline; prepend one so a heading at
    # the very start of the slice is still found.
    body = '\n' + content[start_pos:]

    anchors = [
        (kind, match.group(1), match.start() + start_pos, match.group())
        for kind, pattern in (('part', PART_PATTERN), ('item', ITEM_PATTERN))
        for match in pattern.finditer(body)
    ]
    anchors.sort(key=lambda anchor: anchor[2])
    return anchors


def _store_item(document, item_id, chunks):
    """Write the collected text of ``item_id`` into its part of ``document``."""
    # Items not listed in ITEM_TO_PART (and the initial None) are discarded.
    if item_id in ITEM_TO_PART:
        part_num = ROMAN_TO_NUM[ITEM_TO_PART[item_id]]
        document[f"part{part_num}"][f"item{item_id.lower()}"] = "\n\n".join(chunks)


def extract_sections(content, anchors, filename):
    """Split ``content`` into per-item text, grouped by part.

    Args:
        content: Full document text.
        anchors: Output of ``find_anchors(content)``.
        filename: Source path; its stem becomes ``metadata.document_name``.

    Returns:
        ``{}`` when ``anchors`` is empty, otherwise
        ``{"metadata": {...}, "document": {"partN": {"itemX": text}}}`` with
        empty parts removed. Consecutive anchors for the same item are joined
        with a blank line.
    """
    if not anchors:
        return {}

    document = {"part1": {}, "part2": {}, "part3": {}, "part4": {}}
    current_item = None
    chunks = []

    for index, anchor in enumerate(anchors):
        if anchor[0] != 'item':
            # Part headings only serve as the end boundary of the item
            # before them.
            continue
        end = anchors[index + 1][2] if index + 1 < len(anchors) else len(content)
        # ITEM_PATTERN is case-insensitive, so "Item 1a" yields '1a', while
        # ITEM_TO_PART is keyed by '1A'. Normalise, otherwise such sections
        # are silently dropped.
        item_id = anchor[1].upper()
        if item_id != current_item:
            _store_item(document, current_item, chunks)
            current_item = item_id
            chunks = []
        chunks.append(content[anchor[2]:end].strip())

    _store_item(document, current_item, chunks)

    return {
        "metadata": {"document_name": Path(filename).stem},
        # Only keep non-empty parts.
        "document": {part: items for part, items in document.items() if items},
    }


def parse_10k(filename):
    """Load ``filename`` and return its sections, or ``{}`` if IS_10K_PATTERN never matches."""
    content = load_file_content(filename)
    if not IS_10K_PATTERN.search(content):
        return {}
    anchors = find_anchors(content)
    return extract_sections(content, anchors, filename)
@@ -1,73 +0,0 @@
1
- from pathlib import Path
2
- from .helper import load_file_content, clean_title
3
- import re
4
-
5
# Heading patterns (case-insensitive). Each requires the heading to follow a
# newline plus optional whitespace.
PART_II_PATTERN = re.compile(r'\n\s*part\s+II\.?(?:[:\s\.]|$)', re.I)
ITEM_PATTERN = re.compile(r'\n\s*item\s+(\d+[A-Z]?)\.?(?:[:\s\.]|$)', re.I)
# Spans from an "item 6" mention up to (not including) the next "item 1"
# heading; find_content_start uses the end of this span as the search start.
TOC_END_PATTERN = re.compile(r'(?:item\s*6\.?).*?(?=\n\s*item\s*1\.?\b)', re.I | re.DOTALL)
# Compiled once here instead of on every find_content_start call.
_ITEM_1_PATTERN = re.compile(r'\n\s*item\s*1\b', re.I)


def find_content_start(content):
    """Return the offset at which item scanning should begin.

    When TOC_END_PATTERN matches, the offset of the first "item 1" heading
    found after that match is returned; otherwise 0.
    """
    toc_match = TOC_END_PATTERN.search(content)
    if toc_match is None:
        return 0
    item_1_match = _ITEM_1_PATTERN.search(content, toc_match.end())
    if item_1_match is None:
        return 0
    return item_1_match.start()


def find_anchors(content):
    """Locate item headings and the first "Part II" heading.

    Returns:
        ``(anchors, part_ii_pos)`` where ``anchors`` is a position-sorted
        list of ``('item', label, position, matched_text)`` tuples and
        ``part_ii_pos`` is the position of the first Part II heading, or
        ``None`` when there is none.
    """
    start_pos = find_content_start(content)
    # The patterns require a leading newline; prepend one so a heading at
    # the very start of the slice is still found.
    body = '\n' + content[start_pos:]

    part_ii_match = PART_II_PATTERN.search(body)
    part_ii_pos = part_ii_match.start() + start_pos if part_ii_match else None

    anchors = [
        ('item', match.group(1), match.start() + start_pos, match.group())
        for match in ITEM_PATTERN.finditer(body)
    ]
    anchors.sort(key=lambda anchor: anchor[2])
    return anchors, part_ii_pos


def _store_item(document, key, chunks):
    """Write collected text under ``key = (part_key, item_id)``; no-op for None."""
    if key is not None:
        part_key, item_id = key
        document[part_key][f"item{item_id}"] = "\n\n".join(chunks)


def extract_sections(content, anchors_and_part2, filename):
    """Split ``content`` into per-item text under ``part1`` / ``part2``.

    Args:
        content: Full document text.
        anchors_and_part2: Output of ``find_anchors(content)``.
        filename: Source path; its stem becomes ``metadata.document_name``.

    Returns:
        ``{}`` when there are no anchors, otherwise
        ``{"metadata": {...}, "document": {"partN": {"itemX": text}}}`` with
        empty parts removed. Consecutive anchors for the same item within
        the same part are joined with a blank line.
    """
    anchors, part2_pos = anchors_and_part2
    if not anchors:
        return {}

    document = {"part1": {}, "part2": {}}
    # Explicit None test: a Part II heading at offset 0 is a valid position,
    # but a plain truthiness check treated it as "no Part II".
    has_part2 = part2_pos is not None

    current_key = None
    chunks = []

    for index, anchor in enumerate(anchors):
        pos = anchor[2]
        end = anchors[index + 1][2] if index + 1 < len(anchors) else len(content)
        # Stop the last Part I item at the Part II heading so the heading is
        # not absorbed into that item's text.
        if has_part2 and pos < part2_pos < end:
            end = part2_pos

        part_key = "part2" if has_part2 and pos >= part2_pos else "part1"
        # The part is included in the key so that "Item 1" of Part I followed
        # directly by "Item 1" of Part II is not merged into one section.
        # Lower-casing makes '1A' and '1a' the same item.
        key = (part_key, anchor[1].lower())
        if key != current_key:
            _store_item(document, current_key, chunks)
            current_key = key
            chunks = []
        chunks.append(content[pos:end].strip())

    _store_item(document, current_key, chunks)

    return {
        "metadata": {"document_name": Path(filename).stem},
        # Clean empty parts.
        "document": {part: items for part, items in document.items() if items},
    }


def parse_10q(filename):
    """Load ``filename`` and return its sections grouped by part."""
    content = load_file_content(filename)
    anchors_and_part2 = find_anchors(content)
    return extract_sections(content, anchors_and_part2, filename)