datamule 0.381__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- datamule/__init__.py +46 -86
- datamule/book.py +16 -0
- datamule/config.py +29 -0
- datamule/data/company_former_names.csv +8148 -8148
- datamule/data/company_metadata.csv +10049 -10049
- datamule/data/company_tickers.csv +9999 -10168
- datamule/data/sec-glossary.csv +728 -728
- datamule/data/xbrl_descriptions.csv +10024 -10024
- datamule/document.py +278 -0
- datamule/downloader/downloader.py +374 -0
- datamule/downloader/premiumdownloader.py +335 -0
- datamule/helper.py +123 -136
- datamule/mapping_dicts/txt_mapping_dicts.py +232 -0
- datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
- datamule/monitor.py +238 -0
- datamule/mulebot/__init__.py +1 -1
- datamule/mulebot/helper.py +34 -34
- datamule/mulebot/mulebot.py +129 -129
- datamule/mulebot/mulebot_server/server.py +86 -86
- datamule/mulebot/mulebot_server/static/css/minimalist.css +173 -173
- datamule/mulebot/mulebot_server/static/scripts/artifacts.js +67 -67
- datamule/mulebot/mulebot_server/static/scripts/chat.js +91 -91
- datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +55 -55
- datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +14 -14
- datamule/mulebot/mulebot_server/static/scripts/main.js +56 -56
- datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +26 -26
- datamule/mulebot/mulebot_server/static/scripts/suggestions.js +46 -46
- datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +128 -128
- datamule/mulebot/mulebot_server/static/scripts/utils.js +27 -27
- datamule/mulebot/mulebot_server/templates/chat-minimalist.html +90 -90
- datamule/mulebot/search.py +51 -51
- datamule/mulebot/tools.py +82 -82
- datamule/packageupdater.py +207 -0
- datamule/portfolio.py +106 -0
- datamule/submission.py +76 -0
- datamule-1.0.0.dist-info/METADATA +27 -0
- datamule-1.0.0.dist-info/RECORD +40 -0
- {datamule-0.381.dist-info → datamule-1.0.0.dist-info}/WHEEL +1 -1
- datamule/data/filing_types.csv +0 -485
- datamule/data/ftd_locations.csv +0 -388
- datamule/datamule_api.py +0 -21
- datamule/dataset_builder/_init.py +0 -1
- datamule/dataset_builder/dataset_builder.py +0 -260
- datamule/downloader/__init__.py +0 -0
- datamule/downloader/dropbox_downloader.py +0 -225
- datamule/downloader/ftd.py +0 -216
- datamule/downloader/information_table_13f.py +0 -231
- datamule/downloader/sec_downloader.py +0 -635
- datamule/filing_viewer/__init__.py +0 -1
- datamule/filing_viewer/filing_viewer.py +0 -256
- datamule/global_vars.py +0 -202
- datamule/parser/__init__.py +0 -1
- datamule/parser/basic_10k_parser.py +0 -82
- datamule/parser/basic_10q_parser.py +0 -73
- datamule/parser/basic_13d_parser.py +0 -58
- datamule/parser/basic_13g_parser.py +0 -61
- datamule/parser/basic_8k_parser.py +0 -84
- datamule/parser/company_concepts_parser.py +0 -0
- datamule/parser/form_d_parser.py +0 -70
- datamule/parser/generalized_item_parser.py +0 -78
- datamule/parser/generalized_xml_parser.py +0 -0
- datamule/parser/helper.py +0 -75
- datamule/parser/information_table_parser_13fhr.py +0 -41
- datamule/parser/insider_trading_parser.py +0 -158
- datamule/parser/mappings.py +0 -95
- datamule/parser/n_port_p_parser.py +0 -70
- datamule/parser/sec_parser.py +0 -79
- datamule/parser/sgml_parser.py +0 -180
- datamule/sec_filing.py +0 -126
- datamule/sec_search.py +0 -20
- datamule-0.381.dist-info/METADATA +0 -132
- datamule-0.381.dist-info/RECORD +0 -61
- {datamule-0.381.dist-info → datamule-1.0.0.dist-info}/top_level.txt +0 -0
datamule/parser/mappings.py
DELETED
@@ -1,95 +0,0 @@
-# I will likely move this file to a more appropriate location in the future
-
-mapping_dict_10k = {
-    'filing_summary': 'Annual report providing comprehensive overview of company business, financial performance, risks, and operations. Contains audited financial statements, business description, risk analysis, and detailed operational metrics.',
-
-    'structure': {
-        'part1': {
-            'summary': 'Overview of company operations, risks, and material business information. Contains key business strategy, market position, competitive landscape, and significant challenges.',
-            'item1': {
-                'summary': 'Detailed description of business operations including primary products/services, markets served, distribution methods, competitive conditions, regulatory environment, and business segments'
-            },
-            'item1a': {
-                'summary': 'Comprehensive list and explanation of significant risks and uncertainties that could affect business performance, financial condition, and stock value'
-            },
-            'item1b': {
-                'summary': 'Disclosure of any unresolved comments or issues raised by SEC staff regarding company filings'
-            },
-            'item1c': {
-                'summary': 'Information about cybersecurity risks, incidents, risk management, governance, and strategy'
-            },
-            'item2': {
-                'summary': 'Description of principal physical properties, including manufacturing facilities, offices, warehouses, and other significant real estate'
-            },
-            'item3': {
-                'summary': 'Description of material pending legal proceedings, including potential impacts on business'
-            },
-            'item4': {
-                'summary': 'Disclosure of mine safety violations, citations, and orders received under the Mine Act'
-            }
-        },
-        'part2': {
-            'summary': 'Detailed financial performance analysis, including management insights, market risks, and complete audited financial statements.',
-            'item5': {
-                'summary': 'Information about company stock, including market data, price history, dividends, share repurchases, and securities offerings'
-            },
-            'item6': {
-                'summary': 'Selected historical financial data showing trends in financial condition and results over past 5 years'
-            },
-            'item7': {
-                'summary': 'Management\'s analysis of financial condition, operations results, liquidity, capital resources, and future outlook'
-            },
-            'item7a': {
-                'summary': 'Discussion of exposure to market risk including interest rates, foreign exchange, commodities, and hedging activities'
-            },
-            'item8': {
-                'summary': 'Audited financial statements, including balance sheets, income statements, cash flows, and comprehensive notes'
-            },
-            'item9': {
-                'summary': 'Information about changes in independent auditors and any disagreements with them'
-            },
-            'item9a': {
-                'summary': 'Management\'s assessment of internal control effectiveness over financial reporting'
-            },
-            'item9b': {
-                'summary': 'Other significant information not reported elsewhere in the filing'
-            }
-        },
-        'part3': {
-            'summary': 'Information about company leadership, compensation structures, and corporate governance practices.',
-            'item10': {
-                'summary': 'Information about directors and executive officers, including their experience, qualifications, and corporate governance practices'
-            },
-            'item11': {
-                'summary': 'Detailed information about executive compensation, including salary, bonuses, stock awards, and compensation policies'
-            },
-            'item12': {
-                'summary': 'Information about beneficial ownership of securities by management and major shareholders, equity compensation plans'
-            },
-            'item13': {
-                'summary': 'Description of transactions with related parties and potential conflicts of interest'
-            },
-            'item14': {
-                'summary': 'Disclosure of fees paid for audit and non-audit services provided by independent accountants'
-            }
-        },
-        'part4': {
-            'summary': 'Supporting documentation and additional required disclosures.',
-            'item15': {
-                'summary': 'List of all exhibits, including material contracts, corporate documents, and supplementary financial information'
-            },
-            'item16': {
-                'summary': 'Optional summary of key information from the entire Form 10-K filing'
-            }
-        }
-    },
-
-    'search_hints': {
-        'financial_metrics': ['item6', 'item7', 'item8'],
-        'risk_assessment': ['item1a', 'item1c', 'item7a'],
-        'business_overview': ['item1', 'item2'],
-        'leadership_info': ['item10', 'item11'],
-        'material_events': ['item3', 'item9', 'item13'],
-        'operational_data': ['item1', 'item7', 'item2']
-    }
-}
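For context, a minimal sketch of how a consumer might use the removed mapping; the helper function below is illustrative and was not part of the package:

    from datamule.parser.mappings import mapping_dict_10k

    def items_for_topic(topic):
        # 'search_hints' maps broad topics (e.g. 'risk_assessment') to 10-K item keys.
        return mapping_dict_10k['search_hints'].get(topic, [])

    print(items_for_topic('risk_assessment'))  # ['item1a', 'item1c', 'item7a']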
datamule/parser/n_port_p_parser.py
DELETED
@@ -1,70 +0,0 @@
-from xml.etree import ElementTree as ET
-
-def element_to_dict(elem):
-    """Convert an XML element to dict preserving structure."""
-    result = {}
-
-    # Add attributes directly to result
-    if elem.attrib:
-        result.update(elem.attrib)
-
-    # Add text content if present and no children
-    if elem.text and elem.text.strip():
-        text = elem.text.strip()
-        if not len(elem):  # No children
-            return text
-        else:
-            result['text'] = text
-
-    # Process children
-    for child in elem:
-        child_data = element_to_dict(child)
-        child_tag = child.tag.split('}')[-1]  # Remove namespace
-
-        if child_tag in result:
-            # Convert to list if multiple elements
-            if not isinstance(result[child_tag], list):
-                result[child_tag] = [result[child_tag]]
-            result[child_tag].append(child_data)
-        else:
-            result[child_tag] = child_data
-
-    return result
-
-def parse_nport_p(filepath):
-    """Parse NPORT XML file into metadata and document sections."""
-    # Parse XML
-    tree = ET.parse(filepath)
-    root = tree.getroot()
-
-    # Remove namespaces for cleaner processing
-    for elem in root.iter():
-        if '}' in elem.tag:
-            elem.tag = elem.tag.split('}')[-1]
-
-    # Convert entire document to dict
-    full_dict = element_to_dict(root)
-
-    # Separate metadata and document content
-    result = {
-        'metadata': {},
-        'document': {}
-    }
-
-    # Extract metadata sections
-    if 'headerData' in full_dict:
-        result['metadata']['headerData'] = full_dict['headerData']
-
-    if 'formData' in full_dict and 'genInfo' in full_dict['formData']:
-        result['metadata']['genInfo'] = full_dict['formData']['genInfo']
-
-    # Everything else goes to document
-    result['document'] = full_dict
-
-    # Remove metadata sections from document to avoid duplication
-    if 'headerData' in result['document']:
-        del result['document']['headerData']
-    if 'formData' in result['document'] and 'genInfo' in result['document']['formData']:
-        del result['document']['formData']['genInfo']
-
-    return result
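A minimal usage sketch for the removed parser; the file path is hypothetical:

    from datamule.parser.n_port_p_parser import parse_nport_p

    parsed = parse_nport_p('nport_p_filing.xml')  # hypothetical local file
    print(parsed['metadata'].keys())  # e.g. 'headerData', 'genInfo' when present
    holdings = parsed['document']     # remaining form data, as nested dicts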
datamule/parser/sec_parser.py
DELETED
@@ -1,79 +0,0 @@
-import xml.etree.ElementTree as ET
-from ..datamule_api import parse_textual_filing
-from .basic_8k_parser import parse_8k
-from .basic_10k_parser import parse_10k
-from .basic_10q_parser import parse_10q
-from .information_table_parser_13fhr import parse_13f_hr_information_table_xml
-from .insider_trading_parser import parse_form345
-from .form_d_parser import parse_form_d
-from .n_port_p_parser import parse_nport_p
-from .basic_13d_parser import parse_13d
-from .basic_13g_parser import parse_13g
-from .generalized_item_parser import generalized_parser
-from .mappings import *
-
-class Parser:
-
-    def __init__(self):
-        pass
-
-    def parse_filing(self, filename, filing_type):
-        # add handling for url vs file
-        # api will handle filing type detection
-        if filing_type == '13F-HR-INFORMATIONTABLE':
-            return parse_13f_hr_information_table_xml(filename)
-        # elif filing_type in ['10-K','10KSB','8-K']:
-        #     return generalized_parser(filename)
-        elif filing_type == '8-K':
-            return parse_8k(filename)
-        elif filing_type == '10-K':
-            return parse_10k(filename)
-        elif filing_type == '10-Q':
-            return parse_10q(filename)
-        elif filing_type in ['3', '4', '5']:
-            return parse_form345(filename)
-        elif filing_type == 'D':
-            return parse_form_d(filename)
-        elif filing_type == 'NPORT-P':
-            return parse_nport_p(filename)
-        elif filing_type == 'SC 13D':
-            return parse_13d(filename)
-        elif filing_type == 'SC 13G':
-            return parse_13g(filename)
-        else:
-            data = parse_textual_filing(url=filename, return_type='json')
-            return data
-
-
-    def parse_company_concepts(self, data):
-
-        # get cik
-        cik = data['cik']
-        # get categories
-        categories = list(data['facts'].keys())
-
-        table_dict_list = []
-        for category in categories:
-            for fact in data['facts'][category]:
-                label = data['facts'][category][fact]['label']
-                description = data['facts'][category][fact]['description']
-                units = list(data['facts'][category][fact]['units'].keys())
-
-                for unit in units:
-                    table = data['facts'][category][fact]['units'][unit]
-
-                    # Find all unique keys across all rows
-                    all_keys = set()
-                    for row in table:
-                        all_keys.update(row.keys())
-
-                    # Ensure all rows have all keys
-                    for row in table:
-                        for key in all_keys:
-                            if key not in row:
-                                row[key] = None
-
-                    table_dict = {'cik': cik, 'category': category, 'fact': fact, 'label': label, 'description': description, 'unit': unit, 'table': table}
-                    table_dict_list.append(table_dict)
-
-        return table_dict_list
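A minimal sketch of how this removed dispatcher was typically called; the file name is hypothetical:

    from datamule.parser.sec_parser import Parser

    parser = Parser()
    # Routes to the matching form-specific parser; unrecognized types fall
    # through to the textual-filing API (parse_textual_filing).
    data = parser.parse_filing('apple_10k.htm', filing_type='10-K')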
datamule/parser/sgml_parser.py
DELETED
@@ -1,180 +0,0 @@
-import shutil
-import os
-import json
-
-class UUEncodeError(Exception):
-    pass
-
-def UUdecoder(text):
-    text = text.split('\n')
-    result = bytearray()
-
-    for line in text:
-        if not line or line in ['end', '`']:
-            continue
-
-        length = (ord(line[0]) - 32) & 63
-        chars = line[1:]
-
-        for i in range(0, len(chars), 4):
-            group = chars[i:i+4]
-            if len(group) < 4:
-                break
-
-            n = 0
-            for c in group:
-                n = n * 64 + ((ord(c) - 32) & 63)
-
-            result.append((n >> 16) & 0xFF)
-            if length > 1:
-                result.append((n >> 8) & 0xFF)
-            if length > 2:
-                result.append(n & 0xFF)
-
-            length -= 3
-            if length <= 0:
-                break
-
-    return bytes(result)
-
-def read_line(line):
-    try:
-        key = line.split('<')[1].split('>')[0]
-        value = ''
-        if key.startswith('/'):
-            return None
-        if line.endswith('>'):
-            return {key: {}}
-        value = line.split('>', 1)[1].strip()
-        return {key: value}
-    except:
-        raise ValueError(f"Could not parse line: {line}")
-
-def parse_submission(filepath, output_dir):
-    shutil.rmtree(output_dir, ignore_errors=True)
-    os.makedirs(output_dir, exist_ok=True)
-
-    metadata = {
-        'submission': {},
-        'documents': []
-    }
-
-    tag_stack = []
-    path_stack = [metadata['submission']]
-    current_document = None
-    text_content = []
-    last_key = None
-
-    in_text = False
-    is_uuencoded = False
-    collecting_binary = False
-    lines_since_text_start = 0  # Track lines since <TEXT>
-
-    with open(filepath, 'r') as file:
-        for line in file:
-            line = line.rstrip('\n')  # Preserve spaces but remove newline
-
-            if line == '<SUBMISSION>':
-                tag_stack.append('SUBMISSION')
-
-            elif line == '</SUBMISSION>':
-                tag_stack.pop()
-
-            elif line == '<DOCUMENT>':
-                current_document = {}
-                metadata['documents'].append(current_document)
-                path_stack = [current_document]
-                tag_stack.append('DOCUMENT')
-                last_key = None
-
-            elif line == '</DOCUMENT>':
-                if current_document and text_content:
-                    if 'FILENAME' in current_document:
-                        output_path = os.path.join(output_dir, current_document['FILENAME'])
-                    elif 'SEQUENCE' in current_document:
-                        output_path = os.path.join(output_dir, f'{current_document["SEQUENCE"]}.txt')
-                    else:
-                        raise ValueError("Document does not have a FILENAME or SEQUENCE")
-
-                    if is_uuencoded:
-                        content = UUdecoder('\n'.join(text_content))
-                        with open(output_path, 'wb') as f:
-                            f.write(content)
-                    else:
-                        with open(output_path, 'w', encoding='utf-8') as f:
-                            f.write('\n'.join(text_content))
-
-                text_content = []
-                current_document = None
-                path_stack = [metadata['submission']]
-                tag_stack.pop()
-                last_key = None
-                is_uuencoded = False
-                collecting_binary = False
-                lines_since_text_start = 0
-
-            elif line == '<TEXT>':
-                in_text = True
-                text_content = []
-                tag_stack.append('TEXT')
-                last_key = None
-                lines_since_text_start = 0
-
-            elif line == '</TEXT>':
-                in_text = False
-                tag_stack.pop()
-
-            elif in_text:
-                if collecting_binary:
-                    if line in ['end', '`']:
-                        collecting_binary = False
-                    else:
-                        text_content.append(line)
-                else:
-                    stripped_line = line.strip()
-                    if stripped_line == '<PDF>':
-                        lines_since_text_start = 0  # Reset counter after PDF marker
-                        continue
-
-                    if lines_since_text_start == 0 and not stripped_line:
-                        # Skip empty lines right after <TEXT> or <PDF>
                        continue
-
-                    if lines_since_text_start == 0 and stripped_line.startswith('begin 644'):
-                        is_uuencoded = True
-                        collecting_binary = True
-                    else:
-                        if not is_uuencoded:
-                            text_content.append(line)
-                    lines_since_text_start += 1
-
-            else:
-                if line.startswith('<'):
-                    parsed = read_line(line)
-                    if parsed is None:
-                        if tag_stack:
-                            tag_stack.pop()
-                        if len(path_stack) > 1:
-                            path_stack.pop()
-                        last_key = None
-                    else:
-                        key = list(parsed.keys())[0]
-                        value = parsed[key]
-
-                        if isinstance(value, dict):
-                            current_dict = path_stack[-1]
-                            current_dict[key] = {}
-                            path_stack.append(current_dict[key])
-                            tag_stack.append(key)
-                            last_key = None
-                        else:
-                            current_dict = path_stack[-1]
-                            current_dict[key] = value
-                            last_key = key
-                elif last_key:
-                    current_dict = path_stack[-1]
-                    current_dict[last_key] += ' ' + line.strip()
-
-    metadata_path = os.path.join(output_dir, 'metadata.json')
-    with open(metadata_path, 'w') as f:
-        json.dump(metadata, f, indent=4)
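A minimal usage sketch for the removed SGML splitter; the paths are hypothetical. It writes each embedded document into output_dir and a metadata.json describing the submission:

    import json
    from datamule.parser.sgml_parser import parse_submission

    parse_submission('0001234567-24-000001.txt', 'out/')  # hypothetical paths
    with open('out/metadata.json') as f:
        meta = json.load(f)
    print(len(meta['documents']))  # one entry per <DOCUMENT> block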
datamule/sec_filing.py
DELETED
@@ -1,126 +0,0 @@
-import json
-import csv
-from .parser.sec_parser import Parser
-from .helper import convert_to_dashed_accession
-
-class Filing:
-    def __init__(self, filename, filing_type):
-        self.filename = filename
-        self.parser = Parser()
-        self.data = None
-        self.filing_type = filing_type
-
-    def parse_filing(self):
-        self.data = self.parser.parse_filing(self.filename, self.filing_type)
-        return self.data
-
-    def write_json(self, output_filename=None):
-        if not self.data:
-            raise ValueError("No data to write. Parse filing first.")
-
-        if output_filename is None:
-            output_filename = f"{self.filename.rsplit('.', 1)[0]}.json"
-
-        with open(output_filename, 'w') as f:
-            json.dump(self.data, f, indent=2)
-
-    def write_csv(self, output_filename=None, accession_number=None):
-        if self.data is None:
-            raise ValueError("No data available. Please call parse_filing() first.")
-
-        if output_filename is None:
-            output_filename = f"{self.filename.rsplit('.', 1)[0]}.csv"
-
-        with open(output_filename, 'w', newline='') as csvfile:
-            if not self.data:
-                return output_filename
-
-            has_document = any('document' in item for item in self.data)
-
-            if has_document and 'document' in self.data:
-                writer = csv.DictWriter(csvfile, ['section', 'text'], quoting=csv.QUOTE_ALL)
-                writer.writeheader()
-                flattened = self._flatten_dict(self.data['document'])
-                for section, text in flattened.items():
-                    writer.writerow({'section': section, 'text': text})
-            else:
-                fieldnames = list(self.data[0].keys())
-                if accession_number:
-                    fieldnames.append('Accession Number')
-                writer = csv.DictWriter(csvfile, fieldnames, quoting=csv.QUOTE_ALL)
-                writer.writeheader()
-                for row in self.data:
-                    if accession_number:
-                        row['Accession Number'] = convert_to_dashed_accession(accession_number)
-                    writer.writerow(row)
-
-        return output_filename
-
-    def _document_to_section_text(self, document_data, parent_key=''):
-        items = []
-
-        if isinstance(document_data, dict):
-            for key, value in document_data.items():
-                # Build the section name
-                section = f"{parent_key}_{key}" if parent_key else key
-
-                # If the value is a dict, recurse
-                if isinstance(value, dict):
-                    items.extend(self._document_to_section_text(value, section))
-                # If it's a list, handle each item
-                elif isinstance(value, list):
-                    for i, item in enumerate(value):
-                        if isinstance(item, dict):
-                            items.extend(self._document_to_section_text(item, f"{section}_{i+1}"))
-                        else:
-                            items.append({
-                                'section': f"{section}_{i+1}",
-                                'text': str(item)
-                            })
-                # Base case - add the item
-                else:
-                    items.append({
-                        'section': section,
-                        'text': str(value)
-                    })
-
-        return items
-
-    def _flatten_dict(self, d, parent_key=''):
-        items = {}
-
-        if isinstance(d, list):
-            return [self._flatten_dict(item) for item in d]
-
-        for k, v in d.items():
-            new_key = f"{parent_key}_{k}" if parent_key else k
-
-            if isinstance(v, dict):
-                items.update(self._flatten_dict(v, new_key))
-            else:
-                items[new_key] = str(v)
-
-        return items
-
-    def __iter__(self):
-        if not self.data:
-            self.parse_filing()
-
-        if self.filing_type == '13F-HR-INFORMATIONTABLE':
-            return iter(self.data)
-        elif self.filing_type == '8-K':
-            return iter(self._document_to_section_text(self.data['document']))
-        elif self.filing_type == '10-K':
-            return iter(self._document_to_section_text(self.data['document']))
-        elif self.filing_type == '10-Q':
-            return iter(self._document_to_section_text(self.data['document']))
-        elif self.filing_type in ['3', '4', '5']:
-            return iter(self._flatten_dict(self.data['holdings']))
-        elif self.filing_type == 'D':
-            return iter(self._flatten_dict(self.data['document']['relatedPersonsList']['relatedPersonInfo']))
-        elif self.filing_type == 'NPORT-P':
-            return iter(self._flatten_dict(self.data['document']['formData']['invstOrSecs']['invstOrSec']))
-        elif self.filing_type == 'SC 13D':
-            return iter(self._document_to_section_text(self.data['document']))
-        elif self.filing_type == 'SC 13G':
-            return iter(self._document_to_section_text(self.data['document']))
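A minimal sketch of the removed Filing workflow; the file name is hypothetical:

    from datamule.sec_filing import Filing

    filing = Filing('apple_10k.htm', filing_type='10-K')  # hypothetical file
    filing.parse_filing()
    filing.write_csv()          # one (section, text) row per flattened section
    for section in filing:      # __iter__ parses lazily if needed
        print(section['section'])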
datamule/sec_search.py
DELETED
@@ -1,20 +0,0 @@
-import os
-import re
-
-def search(directory, keyword):
-    results = []
-    keyword = keyword.lower()  # Convert keyword to lowercase
-
-    for root, dirs, files in os.walk(directory):
-        for file in files:
-            if file.endswith(('.xml', '.html', '.txt', '.htm')):  # Add more extensions if needed
-                file_path = os.path.join(root, file)
-                try:
-                    with open(file_path, 'r', encoding='utf-8') as f:
-                        content = f.read().lower()  # Convert content to lowercase
-                        if keyword in content:  # Simple string search instead of regex
-                            results.append(file_path)
-                except Exception as e:
-                    print(f"Error reading {file_path}: {str(e)}")
-
-    return results
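A minimal usage sketch; the directory is hypothetical:

    from datamule.sec_search import search

    matches = search('filings/', 'climate risk')  # case-insensitive substring match
    print(matches)  # list of file paths whose contents contain the keyword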