datamule 0.380__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. datamule/__init__.py +46 -86
  2. datamule/book.py +16 -0
  3. datamule/config.py +29 -0
  4. datamule/data/company_former_names.csv +8148 -8148
  5. datamule/data/company_metadata.csv +10049 -10049
  6. datamule/data/company_tickers.csv +9999 -10168
  7. datamule/data/sec-glossary.csv +728 -728
  8. datamule/data/xbrl_descriptions.csv +10024 -10024
  9. datamule/document.py +278 -0
  10. datamule/downloader/downloader.py +374 -0
  11. datamule/downloader/premiumdownloader.py +335 -0
  12. datamule/helper.py +123 -136
  13. datamule/mapping_dicts/txt_mapping_dicts.py +232 -0
  14. datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
  15. datamule/monitor.py +238 -0
  16. datamule/mulebot/__init__.py +1 -1
  17. datamule/mulebot/helper.py +34 -34
  18. datamule/mulebot/mulebot.py +129 -129
  19. datamule/mulebot/mulebot_server/server.py +86 -86
  20. datamule/mulebot/mulebot_server/static/css/minimalist.css +173 -173
  21. datamule/mulebot/mulebot_server/static/scripts/artifacts.js +67 -67
  22. datamule/mulebot/mulebot_server/static/scripts/chat.js +91 -91
  23. datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +55 -55
  24. datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +14 -14
  25. datamule/mulebot/mulebot_server/static/scripts/main.js +56 -56
  26. datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +26 -26
  27. datamule/mulebot/mulebot_server/static/scripts/suggestions.js +46 -46
  28. datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +128 -128
  29. datamule/mulebot/mulebot_server/static/scripts/utils.js +27 -27
  30. datamule/mulebot/mulebot_server/templates/chat-minimalist.html +90 -90
  31. datamule/mulebot/search.py +51 -51
  32. datamule/mulebot/tools.py +82 -82
  33. datamule/packageupdater.py +207 -0
  34. datamule/portfolio.py +106 -0
  35. datamule/submission.py +76 -0
  36. datamule-1.0.0.dist-info/METADATA +27 -0
  37. datamule-1.0.0.dist-info/RECORD +40 -0
  38. {datamule-0.380.dist-info → datamule-1.0.0.dist-info}/WHEEL +1 -1
  39. datamule/data/filing_types.csv +0 -485
  40. datamule/data/ftd_locations.csv +0 -388
  41. datamule/datamule_api.py +0 -21
  42. datamule/dataset_builder/_init.py +0 -1
  43. datamule/dataset_builder/dataset_builder.py +0 -260
  44. datamule/downloader/__init__.py +0 -0
  45. datamule/downloader/dropbox_downloader.py +0 -225
  46. datamule/downloader/ftd.py +0 -216
  47. datamule/downloader/information_table_13f.py +0 -231
  48. datamule/downloader/sec_downloader.py +0 -635
  49. datamule/filing_viewer/__init__.py +0 -1
  50. datamule/filing_viewer/filing_viewer.py +0 -256
  51. datamule/global_vars.py +0 -202
  52. datamule/parser/__init__.py +0 -1
  53. datamule/parser/basic_10k_parser.py +0 -82
  54. datamule/parser/basic_10q_parser.py +0 -73
  55. datamule/parser/basic_13d_parser.py +0 -58
  56. datamule/parser/basic_13g_parser.py +0 -61
  57. datamule/parser/basic_8k_parser.py +0 -84
  58. datamule/parser/company_concepts_parser.py +0 -0
  59. datamule/parser/form_d_parser.py +0 -70
  60. datamule/parser/generalized_item_parser.py +0 -78
  61. datamule/parser/generalized_xml_parser.py +0 -0
  62. datamule/parser/helper.py +0 -75
  63. datamule/parser/information_table_parser_13fhr.py +0 -41
  64. datamule/parser/insider_trading_parser.py +0 -158
  65. datamule/parser/mappings.py +0 -95
  66. datamule/parser/n_port_p_parser.py +0 -70
  67. datamule/parser/sec_parser.py +0 -79
  68. datamule/parser/sgml_parser.py +0 -180
  69. datamule/sec_filing.py +0 -126
  70. datamule/sec_search.py +0 -20
  71. datamule-0.380.dist-info/METADATA +0 -110
  72. datamule-0.380.dist-info/RECORD +0 -61
  73. {datamule-0.380.dist-info → datamule-1.0.0.dist-info}/top_level.txt +0 -0
datamule/parser/mappings.py DELETED
@@ -1,95 +0,0 @@
- # I will likely move this file to a more appropriate location in the future
-
- mapping_dict_10k = {
-     'filing_summary': 'Annual report providing comprehensive overview of company business, financial performance, risks, and operations. Contains audited financial statements, business description, risk analysis, and detailed operational metrics.',
-
-     'structure': {
-         'part1': {
-             'summary': 'Overview of company operations, risks, and material business information. Contains key business strategy, market position, competitive landscape, and significant challenges.',
-             'item1': {
-                 'summary': 'Detailed description of business operations including primary products/services, markets served, distribution methods, competitive conditions, regulatory environment, and business segments'
-             },
-             'item1a': {
-                 'summary': 'Comprehensive list and explanation of significant risks and uncertainties that could affect business performance, financial condition, and stock value'
-             },
-             'item1b': {
-                 'summary': 'Disclosure of any unresolved comments or issues raised by SEC staff regarding company filings'
-             },
-             'item1c': {
-                 'summary': 'Information about cybersecurity risks, incidents, risk management, governance, and strategy'
-             },
-             'item2': {
-                 'summary': 'Description of principal physical properties, including manufacturing facilities, offices, warehouses, and other significant real estate'
-             },
-             'item3': {
-                 'summary': 'Description of material pending legal proceedings, including potential impacts on business'
-             },
-             'item4': {
-                 'summary': 'Disclosure of mine safety violations, citations, and orders received under the Mine Act'
-             }
-         },
-         'part2': {
-             'summary': 'Detailed financial performance analysis, including management insights, market risks, and complete audited financial statements.',
-             'item5': {
-                 'summary': 'Information about company stock, including market data, price history, dividends, share repurchases, and securities offerings'
-             },
-             'item6': {
-                 'summary': 'Selected historical financial data showing trends in financial condition and results over past 5 years'
-             },
-             'item7': {
-                 'summary': 'Management\'s analysis of financial condition, operations results, liquidity, capital resources, and future outlook'
-             },
-             'item7a': {
-                 'summary': 'Discussion of exposure to market risk including interest rates, foreign exchange, commodities, and hedging activities'
-             },
-             'item8': {
-                 'summary': 'Audited financial statements, including balance sheets, income statements, cash flows, and comprehensive notes'
-             },
-             'item9': {
-                 'summary': 'Information about changes in independent auditors and any disagreements with them'
-             },
-             'item9a': {
-                 'summary': 'Management\'s assessment of internal control effectiveness over financial reporting'
-             },
-             'item9b': {
-                 'summary': 'Other significant information not reported elsewhere in the filing'
-             }
-         },
-         'part3': {
-             'summary': 'Information about company leadership, compensation structures, and corporate governance practices.',
-             'item10': {
-                 'summary': 'Information about directors and executive officers, including their experience, qualifications, and corporate governance practices'
-             },
-             'item11': {
-                 'summary': 'Detailed information about executive compensation, including salary, bonuses, stock awards, and compensation policies'
-             },
-             'item12': {
-                 'summary': 'Information about beneficial ownership of securities by management and major shareholders, equity compensation plans'
-             },
-             'item13': {
-                 'summary': 'Description of transactions with related parties and potential conflicts of interest'
-             },
-             'item14': {
-                 'summary': 'Disclosure of fees paid for audit and non-audit services provided by independent accountants'
-             }
-         },
-         'part4': {
-             'summary': 'Supporting documentation and additional required disclosures.',
-             'item15': {
-                 'summary': 'List of all exhibits, including material contracts, corporate documents, and supplementary financial information'
-             },
-             'item16': {
-                 'summary': 'Optional summary of key information from the entire Form 10-K filing'
-             }
-         }
-     },
-
-     'search_hints': {
-         'financial_metrics': ['item6', 'item7', 'item8'],
-         'risk_assessment': ['item1a', 'item1c', 'item7a'],
-         'business_overview': ['item1', 'item2'],
-         'leadership_info': ['item10', 'item11'],
-         'material_events': ['item3', 'item9', 'item13'],
-         'operational_data': ['item1', 'item7', 'item2']
-     }
- }
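For readers scanning this removal: the dict pairs per-item summaries with `search_hints` that map research topics to 10-K item keys. A minimal sketch of how it could be queried — the helper below is hypothetical, not part of the package, and assumes datamule 0.380 is still installed:

```python
# Hypothetical helper; assumes the 0.380 module shown above.
from datamule.parser.mappings import mapping_dict_10k

def sections_for_topic(mapping, topic):
    """Resolve a search-hint topic to {item_key: summary} via the structure."""
    wanted = set(mapping['search_hints'].get(topic, []))
    found = {}
    for part in mapping['structure'].values():  # part1 .. part4
        for key, value in part.items():
            if key in wanted and isinstance(value, dict):
                found[key] = value['summary']
    return found

print(sections_for_topic(mapping_dict_10k, 'risk_assessment'))
# -> summaries for item1a, item1c and item7a
```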
datamule/parser/n_port_p_parser.py DELETED
@@ -1,70 +0,0 @@
- from xml.etree import ElementTree as ET
-
- def element_to_dict(elem):
-     """Convert an XML element to dict preserving structure."""
-     result = {}
-
-     # Add attributes directly to result
-     if elem.attrib:
-         result.update(elem.attrib)
-
-     # Add text content if present and no children
-     if elem.text and elem.text.strip():
-         text = elem.text.strip()
-         if not len(elem):  # No children
-             return text
-         else:
-             result['text'] = text
-
-     # Process children
-     for child in elem:
-         child_data = element_to_dict(child)
-         child_tag = child.tag.split('}')[-1]  # Remove namespace
-
-         if child_tag in result:
-             # Convert to list if multiple elements
-             if not isinstance(result[child_tag], list):
-                 result[child_tag] = [result[child_tag]]
-             result[child_tag].append(child_data)
-         else:
-             result[child_tag] = child_data
-
-     return result
-
- def parse_nport_p(filepath):
-     """Parse NPORT XML file into metadata and document sections."""
-     # Parse XML
-     tree = ET.parse(filepath)
-     root = tree.getroot()
-
-     # Remove namespaces for cleaner processing
-     for elem in root.iter():
-         if '}' in elem.tag:
-             elem.tag = elem.tag.split('}')[-1]
-
-     # Convert entire document to dict
-     full_dict = element_to_dict(root)
-
-     # Separate metadata and document content
-     result = {
-         'metadata': {},
-         'document': {}
-     }
-
-     # Extract metadata sections
-     if 'headerData' in full_dict:
-         result['metadata']['headerData'] = full_dict['headerData']
-
-     if 'formData' in full_dict and 'genInfo' in full_dict['formData']:
-         result['metadata']['genInfo'] = full_dict['formData']['genInfo']
-
-     # Everything else goes to document
-     result['document'] = full_dict
-
-     # Remove metadata sections from document to avoid duplication
-     if 'headerData' in result['document']:
-         del result['document']['headerData']
-     if 'formData' in result['document'] and 'genInfo' in result['document']['formData']:
-         del result['document']['formData']['genInfo']
-
-     return result
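Removed along with the module above is its public entry point, `parse_nport_p`. A hedged usage sketch (0.380 import path; the XML file path is hypothetical):

```python
# Sketch only; assumes datamule 0.380 and a locally saved NPORT-P XML filing.
from datamule.parser.n_port_p_parser import parse_nport_p

parsed = parse_nport_p('nport-p.xml')    # hypothetical local file
print(list(parsed['metadata'].keys()))   # e.g. ['headerData', 'genInfo']
print(type(parsed['document']))          # remaining form data as nested dicts
```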
datamule/parser/sec_parser.py DELETED
@@ -1,79 +0,0 @@
- import xml.etree.ElementTree as ET
- from ..datamule_api import parse_textual_filing
- from .basic_8k_parser import parse_8k
- from .basic_10k_parser import parse_10k
- from .basic_10q_parser import parse_10q
- from .information_table_parser_13fhr import parse_13f_hr_information_table_xml
- from .insider_trading_parser import parse_form345
- from .form_d_parser import parse_form_d
- from .n_port_p_parser import parse_nport_p
- from .basic_13d_parser import parse_13d
- from .basic_13g_parser import parse_13g
- from .generalized_item_parser import generalized_parser
- from .mappings import *
-
- class Parser:
-
-     def __init__(self):
-         pass
-
-     def parse_filing(self, filename, filing_type):
-         # add handling for url vs file
-         # api will handle filing type detection
-         if filing_type == '13F-HR-INFORMATIONTABLE':
-             return parse_13f_hr_information_table_xml(filename)
-         # elif filing_type in ['10-K','10KSB','8-K']:
-         #     return generalized_parser(filename)
-         elif filing_type == '8-K':
-             return parse_8k(filename)
-         elif filing_type == '10-K':
-             return parse_10k(filename)
-         elif filing_type == '10-Q':
-             return parse_10q(filename)
-         elif filing_type in ['3', '4', '5']:
-             return parse_form345(filename)
-         elif filing_type == 'D':
-             return parse_form_d(filename)
-         elif filing_type == 'NPORT-P':
-             return parse_nport_p(filename)
-         elif filing_type == 'SC 13D':
-             return parse_13d(filename)
-         elif filing_type == 'SC 13G':
-             return parse_13g(filename)
-         else:
-             data = parse_textual_filing(url=filename, return_type='json')
-             return data
-
-
-     def parse_company_concepts(self, data):
-
-         # get cik
-         cik = data['cik']
-         # get categories
-         categories = list(data['facts'].keys())
-
-         table_dict_list = []
-         for category in categories:
-             for fact in data['facts'][category]:
-                 label = data['facts'][category][fact]['label']
-                 description = data['facts'][category][fact]['description']
-                 units = list(data['facts'][category][fact]['units'].keys())
-
-                 for unit in units:
-                     table = data['facts'][category][fact]['units'][unit]
-
-                     # Find all unique keys across all rows
-                     all_keys = set()
-                     for row in table:
-                         all_keys.update(row.keys())
-
-                     # Ensure all rows have all keys
-                     for row in table:
-                         for key in all_keys:
-                             if key not in row:
-                                 row[key] = None
-
-                     table_dict = {'cik':cik, 'category': category, 'fact': fact, 'label': label, 'description': description, 'unit': unit, 'table': table}
-                     table_dict_list.append(table_dict)
-
-         return table_dict_list
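The deleted `Parser` class was a form-type dispatcher: known form types route to the per-form parsers imported above, and anything else falls through to the remote `parse_textual_filing` API. A minimal sketch of the old call pattern (0.380 import path; the filing path is hypothetical):

```python
# Sketch of the removed 0.380 dispatch API; the filing path is hypothetical.
from datamule.parser.sec_parser import Parser

parser = Parser()
data = parser.parse_filing('aapl-10k.htm', filing_type='10-K')  # routes to parse_10k
# Form types without a dedicated parser are sent to the textual-filing API instead.
```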
datamule/parser/sgml_parser.py DELETED
@@ -1,180 +0,0 @@
- import shutil
- import os
- import json
-
- class UUEncodeError(Exception):
-     pass
-
- def UUdecoder(text):
-     text = text.split('\n')
-     result = bytearray()
-
-     for line in text:
-         if not line or line in ['end', '`']:
-             continue
-
-         length = (ord(line[0]) - 32) & 63
-         chars = line[1:]
-
-         for i in range(0, len(chars), 4):
-             group = chars[i:i+4]
-             if len(group) < 4:
-                 break
-
-             n = 0
-             for c in group:
-                 n = n * 64 + ((ord(c) - 32) & 63)
-
-             result.append((n >> 16) & 0xFF)
-             if length > 1:
-                 result.append((n >> 8) & 0xFF)
-             if length > 2:
-                 result.append(n & 0xFF)
-
-             length -= 3
-             if length <= 0:
-                 break
-
-     return bytes(result)
-
- def read_line(line):
-     try:
-         key = line.split('<')[1].split('>')[0]
-         value = ''
-         if key.startswith('/'):
-             return None
-         if line.endswith('>'):
-             return {key: {}}
-         value = line.split('>', 1)[1].strip()
-         return {key: value}
-     except:
-         raise ValueError(f"Could not parse line: {line}")
-
- def parse_submission(filepath, output_dir):
-     shutil.rmtree(output_dir, ignore_errors=True)
-     os.makedirs(output_dir, exist_ok=True)
-
-     metadata = {
-         'submission': {},
-         'documents': []
-     }
-
-     tag_stack = []
-     path_stack = [metadata['submission']]
-     current_document = None
-     text_content = []
-     last_key = None
-
-     in_text = False
-     is_uuencoded = False
-     collecting_binary = False
-     lines_since_text_start = 0  # Track lines since <TEXT>
-
-     with open(filepath, 'r') as file:
-         for line in file:
-             line = line.rstrip('\n')  # Preserve spaces but remove newline
-
-             if line == '<SUBMISSION>':
-                 tag_stack.append('SUBMISSION')
-
-             elif line == '</SUBMISSION>':
-                 tag_stack.pop()
-
-             elif line == '<DOCUMENT>':
-                 current_document = {}
-                 metadata['documents'].append(current_document)
-                 path_stack = [current_document]
-                 tag_stack.append('DOCUMENT')
-                 last_key = None
-
-             elif line == '</DOCUMENT>':
-                 if current_document and text_content:
-                     if 'FILENAME' in current_document:
-                         output_path = os.path.join(output_dir, current_document['FILENAME'])
-                     elif 'SEQUENCE' in current_document:
-                         output_path = os.path.join(output_dir, f'{current_document["SEQUENCE"]}.txt')
-                     else:
-                         raise ValueError("Document does not have a FILENAME or SEQUENCE")
-
-                     if is_uuencoded:
-                         content = UUdecoder('\n'.join(text_content))
-                         with open(output_path, 'wb') as f:
-                             f.write(content)
-                     else:
-                         with open(output_path, 'w', encoding='utf-8') as f:
-                             f.write('\n'.join(text_content))
-
-                 text_content = []
-                 current_document = None
-                 path_stack = [metadata['submission']]
-                 tag_stack.pop()
-                 last_key = None
-                 is_uuencoded = False
-                 collecting_binary = False
-                 lines_since_text_start = 0
-
-             elif line == '<TEXT>':
-                 in_text = True
-                 text_content = []
-                 tag_stack.append('TEXT')
-                 last_key = None
-                 lines_since_text_start = 0
-
-             elif line == '</TEXT>':
-                 in_text = False
-                 tag_stack.pop()
-
-             elif in_text:
-                 if collecting_binary:
-                     if line in ['end', '`']:
-                         collecting_binary = False
-                     else:
-                         text_content.append(line)
-                 else:
-                     stripped_line = line.strip()
-                     if stripped_line == '<PDF>':
-                         lines_since_text_start = 0  # Reset counter after PDF marker
-                         continue
-
-                     if lines_since_text_start == 0 and not stripped_line:
-                         # Skip empty lines right after <TEXT> or <PDF>
-                         continue
-
-                     if lines_since_text_start == 0 and stripped_line.startswith('begin 644'):
-                         is_uuencoded = True
-                         collecting_binary = True
-                     else:
-                         if not is_uuencoded:
-                             text_content.append(line)
-                     lines_since_text_start += 1
-
-             else:
-                 if line.startswith('<'):
-                     parsed = read_line(line)
-                     if parsed is None:
-                         if tag_stack:
-                             tag_stack.pop()
-                         if len(path_stack) > 1:
-                             path_stack.pop()
-                         last_key = None
-                     else:
-                         key = list(parsed.keys())[0]
-                         value = parsed[key]
-
-                         if isinstance(value, dict):
-                             current_dict = path_stack[-1]
-                             current_dict[key] = {}
-                             path_stack.append(current_dict[key])
-                             tag_stack.append(key)
-                             last_key = None
-                         else:
-                             current_dict = path_stack[-1]
-                             current_dict[key] = value
-                             last_key = key
-                 elif last_key:
-                     current_dict = path_stack[-1]
-                     current_dict[last_key] += ' ' + line.strip()
-
-     metadata_path = os.path.join(output_dir, 'metadata.json')
-     with open(metadata_path, 'w') as f:
-         json.dump(metadata, f, indent=4)
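`parse_submission` split a raw EDGAR SGML submission into its constituent documents (UU-decoding any binary attachments) and wrote a metadata.json alongside them. A hedged sketch of the removed entry point — both paths below are hypothetical:

```python
# Sketch only; assumes datamule 0.380. Input/output paths are hypothetical.
from datamule.parser.sgml_parser import parse_submission

# Writes each <DOCUMENT> in the .txt submission into ./submission/,
# decoding uuencoded attachments, plus ./submission/metadata.json.
parse_submission('0000320193-24-000123.txt', output_dir='./submission')
```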
datamule/sec_filing.py DELETED
@@ -1,126 +0,0 @@
- import json
- import csv
- from .parser.sec_parser import Parser
- from .helper import convert_to_dashed_accession
-
- class Filing:
-     def __init__(self, filename, filing_type):
-         self.filename = filename
-         self.parser = Parser()
-         self.data = None
-         self.filing_type = filing_type
-
-     def parse_filing(self):
-         self.data = self.parser.parse_filing(self.filename, self.filing_type)
-         return self.data
-
-     def write_json(self, output_filename=None):
-         if not self.data:
-             raise ValueError("No data to write. Parse filing first.")
-
-         if output_filename is None:
-             output_filename = f"{self.filename.rsplit('.', 1)[0]}.json"
-
-         with open(output_filename, 'w') as f:
-             json.dump(self.data, f, indent=2)
-
-     def write_csv(self, output_filename=None, accession_number=None):
-         if self.data is None:
-             raise ValueError("No data available. Please call parse_filing() first.")
-
-         if output_filename is None:
-             output_filename = f"{self.filename.rsplit('.', 1)[0]}.csv"
-
-         with open(output_filename, 'w', newline='') as csvfile:
-             if not self.data:
-                 return output_filename
-
-             has_document = any('document' in item for item in self.data)
-
-             if has_document and 'document' in self.data:
-                 writer = csv.DictWriter(csvfile, ['section', 'text'], quoting=csv.QUOTE_ALL)
-                 writer.writeheader()
-                 flattened = self._flatten_dict(self.data['document'])
-                 for section, text in flattened.items():
-                     writer.writerow({'section': section, 'text': text})
-             else:
-                 fieldnames = list(self.data[0].keys())
-                 if accession_number:
-                     fieldnames.append('Accession Number')
-                 writer = csv.DictWriter(csvfile, fieldnames, quoting=csv.QUOTE_ALL)
-                 writer.writeheader()
-                 for row in self.data:
-                     if accession_number:
-                         row['Accession Number'] = convert_to_dashed_accession(accession_number)
-                     writer.writerow(row)
-
-         return output_filename
-
-     def _document_to_section_text(self, document_data, parent_key=''):
-         items = []
-
-         if isinstance(document_data, dict):
-             for key, value in document_data.items():
-                 # Build the section name
-                 section = f"{parent_key}_{key}" if parent_key else key
-
-                 # If the value is a dict, recurse
-                 if isinstance(value, dict):
-                     items.extend(self._document_to_section_text(value, section))
-                 # If it's a list, handle each item
-                 elif isinstance(value, list):
-                     for i, item in enumerate(value):
-                         if isinstance(item, dict):
-                             items.extend(self._document_to_section_text(item, f"{section}_{i+1}"))
-                         else:
-                             items.append({
-                                 'section': f"{section}_{i+1}",
-                                 'text': str(item)
-                             })
-                 # Base case - add the item
-                 else:
-                     items.append({
-                         'section': section,
-                         'text': str(value)
-                     })
-
-         return items
-
-     def _flatten_dict(self, d, parent_key=''):
-         items = {}
-
-         if isinstance(d, list):
-             return [self._flatten_dict(item) for item in d]
-
-         for k, v in d.items():
-             new_key = f"{parent_key}_{k}" if parent_key else k
-
-             if isinstance(v, dict):
-                 items.update(self._flatten_dict(v, new_key))
-             else:
-                 items[new_key] = str(v)
-
-         return items
-
-     def __iter__(self):
-         if not self.data:
-             self.parse_filing()
-
-         if self.filing_type == '13F-HR-INFORMATIONTABLE':
-             return iter(self.data)
-         elif self.filing_type == '8-K':
-             return iter(self._document_to_section_text(self.data['document']))
-         elif self.filing_type == '10-K':
-             return iter(self._document_to_section_text(self.data['document']))
-         elif self.filing_type == '10-Q':
-             return iter(self._document_to_section_text(self.data['document']))
-         elif self.filing_type in ['3', '4', '5']:
-             return iter(self._flatten_dict(self.data['holdings']))
-         elif self.filing_type == 'D':
-             return iter(self._flatten_dict(self.data['document']['relatedPersonsList']['relatedPersonInfo']))
-         elif self.filing_type == 'NPORT-P':
-             return iter(self._flatten_dict(self.data['document']['formData']['invstOrSecs']['invstOrSec']))
-         elif self.filing_type == 'SC 13D':
-             return iter(self._document_to_section_text(self.data['document']))
-         elif self.filing_type == 'SC 13G':
-             return iter(self._document_to_section_text(self.data['document']))
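`Filing` wrapped `Parser` with JSON/CSV export and per-form iteration. A hedged sketch of the removed workflow (0.380 import path; the filing path is hypothetical):

```python
# Sketch only; assumes datamule 0.380 and a hypothetical local filing.
from datamule.sec_filing import Filing

filing = Filing('aapl-10k.htm', filing_type='10-K')
filing.parse_filing()
filing.write_json()        # writes aapl-10k.json next to the source file
for record in filing:      # 10-K iteration yields {'section': ..., 'text': ...}
    print(record['section'])
```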
datamule/sec_search.py DELETED
@@ -1,20 +0,0 @@
- import os
- import re
-
- def search(directory, keyword):
-     results = []
-     keyword = keyword.lower()  # Convert keyword to lowercase
-
-     for root, dirs, files in os.walk(directory):
-         for file in files:
-             if file.endswith(('.xml', '.html', '.txt', '.htm')):  # Add more extensions if needed
-                 file_path = os.path.join(root, file)
-                 try:
-                     with open(file_path, 'r', encoding='utf-8') as f:
-                         content = f.read().lower()  # Convert content to lowercase
-                         if keyword in content:  # Simple string search instead of regex
-                             results.append(file_path)
-                 except Exception as e:
-                     print(f"Error reading {file_path}: {str(e)}")
-
-     return results
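The removed `search` helper did a case-insensitive substring scan over downloaded filings. A minimal sketch (0.380 import path; the directory is hypothetical):

```python
# Sketch only; assumes datamule 0.380 and a hypothetical download directory.
from datamule.sec_search import search

hits = search('./filings', keyword='going concern')
print(hits[:5])  # paths of .xml/.html/.txt/.htm files containing the phrase
```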
datamule-0.380.dist-info/METADATA DELETED
@@ -1,110 +0,0 @@
- Metadata-Version: 2.1
- Name: datamule
- Version: 0.380
- Summary: Making it easier to use SEC filings.
- Home-page: https://github.com/john-friedman/datamule-python
- Author: John Friedman
- Description-Content-Type: text/markdown
- Requires-Dist: aiohttp
- Requires-Dist: aiolimiter
- Requires-Dist: tqdm
- Requires-Dist: requests
- Requires-Dist: nest-asyncio
- Requires-Dist: aiofiles
- Requires-Dist: polars
- Requires-Dist: setuptools
- Requires-Dist: selectolax
- Provides-Extra: all
- Requires-Dist: openai ; extra == 'all'
- Requires-Dist: flask ; extra == 'all'
- Requires-Dist: lxml ; extra == 'all'
- Requires-Dist: google-generativeai ; extra == 'all'
- Requires-Dist: pandas ; extra == 'all'
- Requires-Dist: psutil ; extra == 'all'
- Provides-Extra: dataset_builder
- Requires-Dist: pandas ; extra == 'dataset_builder'
- Requires-Dist: google-generativeai ; extra == 'dataset_builder'
- Requires-Dist: psutil ; extra == 'dataset_builder'
- Provides-Extra: filing_viewer
- Requires-Dist: lxml ; extra == 'filing_viewer'
- Provides-Extra: mulebot
- Requires-Dist: openai ; extra == 'mulebot'
- Provides-Extra: mulebot_server
- Requires-Dist: flask ; extra == 'mulebot_server'
-
- # datamule
-
- [![Downloads](https://static.pepy.tech/badge/datamule)](https://pepy.tech/project/datamule)
- [![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2Fjohn-friedman%2Fdatamule-python&count_bg=%2379C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false)](https://hits.seeyoufarm.com)
- ![GitHub](https://img.shields.io/github/stars/john-friedman/datamule-python)
-
- A Python package for working with SEC filings at scale. Includes [Mulebot](https://chat.datamule.xyz/), an open-source chatbot for SEC data with no storage requirements. 📚 [Full Documentation](https://john-friedman.github.io/datamule-python/) | 🌐 [Website](https://datamule.xyz/)
-
- Articles:
- - [Article: Creating Structured Datasets from SEC filings](https://medium.com/@jgfriedman99/how-to-create-alternative-datasets-using-datamule-d3a0192da8f6)
- - [Article: Deploy a Financial Chatbot in 5 Minutes](https://medium.com/@jgfriedman99/how-to-deploy-a-financial-chatbot-in-5-minutes-ef5eec973d4c)
-
- ## Key Features
-
- - 📥 [Download SEC filings quickly and efficiently](https://john-friedman.github.io/datamule-python/usage/downloader.html#)
- - 🔍 [Monitor EDGAR for new filings in real-time](https://john-friedman.github.io/datamule-python/usage/downloader.html#monitoring-new-filings)
- - 📊 [Parse filings at scale](https://john-friedman.github.io/datamule-python/usage/parsing.html#filing-parser)
- - 💾 [Access comprehensive datasets (10-Ks, SIC codes, etc.)](https://john-friedman.github.io/datamule-python/datasets.html)
- - 🤖 [Interact with SEC data using MuleBot](https://john-friedman.github.io/datamule-python/usage/mulebot.html)
-
- ## Quick Start
-
- ```bash
- # Basic installation
- pip install datamule
-
- # Install with all features
- pip install datamule[all]
- ```
-
- ```python
- import datamule as dm
-
- # Download filings
- downloader = dm.Downloader()
- downloader.download(form='10-K', ticker='AAPL')
-
- # Download filing attachments such as information tables
- downloader.download(form='13F-HR', file_types=['INFORMATION TABLE'], date=('2024-09-14', '2024-09-16'))
-
- # Download every 10-Q from 2023. Should take 2 minutes.
- downloader.download_dataset(dataset='10q_2023')
- ```
-
- ## Available Extras
-
- - `filing_viewer`: Filing viewer module
- - `mulebot`: SEC data interaction chatbot
- - `mulebot_server`: Flask server for MuleBot
- - `all`: All available features
-
- ## Resources
-
- - 📊 [SEC Filing Glossary](https://datamule.xyz/sec_glossary)
- - 📈 [XBRL Fact Glossary](https://datamule.xyz/xbrl_fact_glossary)
- - 🤖 [Try MuleBot](https://chat.datamule.xyz/)
-
- ## Datasets
-
- Access comprehensive SEC datasets including:
- - Historical FTD data (since 2004)
- - 10-K and 10-Q filings (since 2001)
- - 13F-HR Information Tables (since 2013)
- - MD&A collection (100,000+ since 2001, requires free API key)
-
- ## Contributing
-
- Contributions are welcome! Please feel free to submit a Pull Request.
-
- ## License
-
- [MIT License](LICENSE)
-
- ---
-
- For detailed usage examples, API reference, and advanced features, please visit our [documentation](https://john-friedman.github.io/datamule-python/).