datamule 0.416__cp310-cp310-macosx_10_9_universal2.whl → 0.418__cp310-cp310-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamule might be problematic. Click here for more details.
- datamule/__init__.py +9 -0
- datamule/downloader/downloader.py +364 -0
- datamule/downloader/premiumdownloader.py +332 -0
- datamule/parser/document_parsing/basic_10k_parser.py +82 -0
- datamule/parser/document_parsing/basic_10q_parser.py +73 -0
- datamule/parser/document_parsing/basic_13d_parser.py +58 -0
- datamule/parser/document_parsing/basic_13g_parser.py +61 -0
- datamule/parser/document_parsing/basic_8k_parser.py +84 -0
- datamule/parser/document_parsing/company_concepts_parser.py +0 -0
- datamule/parser/document_parsing/form_d_parser.py +70 -0
- datamule/parser/document_parsing/generalized_item_parser.py +78 -0
- datamule/parser/document_parsing/generalized_xml_parser.py +0 -0
- datamule/parser/document_parsing/helper.py +75 -0
- datamule/parser/document_parsing/information_table_parser_13fhr.py +41 -0
- datamule/parser/document_parsing/insider_trading_parser.py +158 -0
- datamule/parser/document_parsing/mappings.py +95 -0
- datamule/parser/document_parsing/n_port_p_parser.py +70 -0
- datamule/parser/document_parsing/sec_parser.py +73 -0
- datamule/parser/document_parsing/sgml_parser.py +94 -0
- datamule/parser/sgml_parsing/sgml_parser_cy.cpython-310-darwin.so +0 -0
- {datamule-0.416.dist-info → datamule-0.418.dist-info}/METADATA +4 -4
- {datamule-0.416.dist-info → datamule-0.418.dist-info}/RECORD +24 -6
- {datamule-0.416.dist-info → datamule-0.418.dist-info}/WHEEL +0 -0
- {datamule-0.416.dist-info → datamule-0.418.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import uu
|
|
4
|
+
from io import BytesIO
|
|
5
|
+
|
|
6
|
+
class SimpleSGMLParser:
|
|
7
|
+
def _extract_tag_content(self, line: str) -> tuple[str, str] | None:
|
|
8
|
+
if not (line.startswith('<') and '>' in line):
|
|
9
|
+
return None
|
|
10
|
+
|
|
11
|
+
tag_end = line.index('>')
|
|
12
|
+
tag = line[1:tag_end]
|
|
13
|
+
|
|
14
|
+
if tag.startswith('/'):
|
|
15
|
+
return None
|
|
16
|
+
|
|
17
|
+
content = line[tag_end + 1:].strip()
|
|
18
|
+
return (tag, content)
|
|
19
|
+
|
|
20
|
+
def _write_document(self, content: str, document_info: dict, output_dir: str) -> None:
|
|
21
|
+
if not content:
|
|
22
|
+
return
|
|
23
|
+
|
|
24
|
+
output_path = os.path.join(output_dir, document_info.get('FILENAME', f"{document_info.get('SEQUENCE', 'unknown')}.txt"))
|
|
25
|
+
|
|
26
|
+
first_line = content.partition('\n')[0].strip()
|
|
27
|
+
if first_line.startswith('begin '):
|
|
28
|
+
with BytesIO(content.encode()) as input_file:
|
|
29
|
+
uu.decode(input_file, output_path,quiet=True)
|
|
30
|
+
else:
|
|
31
|
+
with open(output_path, 'w', encoding='utf-8') as f:
|
|
32
|
+
f.write(content)
|
|
33
|
+
|
|
34
|
+
def parse_file(self, filepath: str, output_dir: str) -> None:
|
|
35
|
+
os.makedirs(output_dir, exist_ok=True)
|
|
36
|
+
|
|
37
|
+
submission_data = {}
|
|
38
|
+
documents = []
|
|
39
|
+
current_document = {}
|
|
40
|
+
text_buffer = []
|
|
41
|
+
|
|
42
|
+
in_document = False
|
|
43
|
+
in_text = False
|
|
44
|
+
in_submission = True
|
|
45
|
+
|
|
46
|
+
with open(filepath, 'r', encoding='utf-8') as file:
|
|
47
|
+
for line in file:
|
|
48
|
+
stripped = line.strip()
|
|
49
|
+
|
|
50
|
+
if stripped == '<DOCUMENT>':
|
|
51
|
+
in_document = True
|
|
52
|
+
in_submission = False
|
|
53
|
+
current_document = {}
|
|
54
|
+
|
|
55
|
+
elif stripped == '</DOCUMENT>':
|
|
56
|
+
documents.append(current_document)
|
|
57
|
+
self._write_document(''.join(text_buffer), current_document, output_dir)
|
|
58
|
+
text_buffer = []
|
|
59
|
+
in_document = False
|
|
60
|
+
|
|
61
|
+
elif stripped == '<TEXT>':
|
|
62
|
+
in_text = True
|
|
63
|
+
text_buffer = []
|
|
64
|
+
|
|
65
|
+
elif stripped == '</TEXT>':
|
|
66
|
+
in_text = False
|
|
67
|
+
|
|
68
|
+
elif in_text:
|
|
69
|
+
if stripped not in ['<PDF>', '</PDF>']:
|
|
70
|
+
text_buffer.append(line)
|
|
71
|
+
|
|
72
|
+
else:
|
|
73
|
+
tag_content = self._extract_tag_content(stripped)
|
|
74
|
+
if tag_content:
|
|
75
|
+
key, value = tag_content
|
|
76
|
+
if in_submission:
|
|
77
|
+
submission_data[key] = value
|
|
78
|
+
elif in_document:
|
|
79
|
+
current_document[key] = value
|
|
80
|
+
|
|
81
|
+
metadata = {
|
|
82
|
+
'submission': submission_data,
|
|
83
|
+
'documents': documents
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
with open(os.path.join(output_dir, 'metadata.json'), 'w', encoding='utf-8') as f:
|
|
87
|
+
json.dump(metadata, f, indent=4)
|
|
88
|
+
|
|
89
|
+
def parse_sgml_submission(filepath: str, output_dir: str | None = None) -> None:
    """Parse the SGML submission at ``filepath`` with ``SimpleSGMLParser``.

    Args:
        filepath: Path of the submission file to parse.
        output_dir: Directory that receives the parser output. When omitted,
            the input path with its extension removed and ``_output``
            appended is used.
    """
    target_dir = output_dir
    if target_dir is None:
        base, _extension = os.path.splitext(filepath)
        target_dir = base + '_output'

    SimpleSGMLParser().parse_file(filepath, target_dir)
|
|
Binary file
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datamule
|
|
3
|
-
Version: 0.416
|
|
3
|
+
Version: 0.418
|
|
4
4
|
Summary: Making it easier to use SEC filings.
|
|
5
5
|
Home-page: https://github.com/john-friedman/datamule-python
|
|
6
6
|
Author: John Friedman
|
|
@@ -16,11 +16,11 @@ Requires-Dist: selectolax
|
|
|
16
16
|
Requires-Dist: pytz
|
|
17
17
|
Requires-Dist: zstandard
|
|
18
18
|
Provides-Extra: all
|
|
19
|
-
Requires-Dist: pandas; extra == "all"
|
|
20
19
|
Requires-Dist: openai; extra == "all"
|
|
21
|
-
Requires-Dist: google-generativeai; extra == "all"
|
|
22
|
-
Requires-Dist: psutil; extra == "all"
|
|
23
20
|
Requires-Dist: flask; extra == "all"
|
|
21
|
+
Requires-Dist: psutil; extra == "all"
|
|
22
|
+
Requires-Dist: google-generativeai; extra == "all"
|
|
23
|
+
Requires-Dist: pandas; extra == "all"
|
|
24
24
|
Provides-Extra: dataset_builder
|
|
25
25
|
Requires-Dist: pandas; extra == "dataset-builder"
|
|
26
26
|
Requires-Dist: google-generativeai; extra == "dataset-builder"
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
datamule/__init__.py,sha256=
|
|
1
|
+
datamule/__init__.py,sha256=i3HnWFrqEdsK2OmvE7NUjcO05w1BuIYBV6J4cSFaN3s,2268
|
|
2
2
|
datamule/document.py,sha256=Yn8UqUjKwYPE29MrMjreHK_HY9eTqOSjPyM5B1VBrHQ,5144
|
|
3
3
|
datamule/helper.py,sha256=tr3AQWus9dHNZFKpLSglWjcb8zmm5qDXjOWACMhvMxQ,4594
|
|
4
4
|
datamule/monitor.py,sha256=mRaM8v5NgcMF9DJ1s_YBzucjrbr-3yFwW422MVml-_Q,9114
|
|
@@ -11,6 +11,8 @@ datamule/data/company_tickers.csv,sha256=GW6lOP54RiGJCx-d9N5jEBy7tGVgU3zI-5xHJXr
|
|
|
11
11
|
datamule/data/sec-glossary.csv,sha256=-cN7GjiadLw5C1sv4zSeCnfeZZDYeSgJl-0ydarMAo0,251209
|
|
12
12
|
datamule/data/xbrl_descriptions.csv,sha256=SQ9wUURNqG424rnTiZtopsxV2q-PvU4NMj52LqgDsvg,2621524
|
|
13
13
|
datamule/dataset_builder/dataset_builder.py,sha256=NCvNbDwlEkA_eAbqbsG--YlqPBDREFTVSM1GJquR0RE,9747
|
|
14
|
+
datamule/downloader/downloader.py,sha256=XJF0FfoCB43_22lhPakq9dI-oJ_SsXIvNx4PUVmcEOc,14509
|
|
15
|
+
datamule/downloader/premiumdownloader.py,sha256=JH4aZ-ZwARCIACKwgzSgHAuOkKPc_GnhiUHSSu22XO4,14206
|
|
14
16
|
datamule/mulebot/__init__.py,sha256=YvZXV6xQ0iP-oGD8rloufjdwJL6D46P3NNr0CY9PQCA,29
|
|
15
17
|
datamule/mulebot/helper.py,sha256=olztOwltfELZ-IERM2bRNLBavD04kfB6ueWTisJAleA,1080
|
|
16
18
|
datamule/mulebot/mulebot.py,sha256=XbtgvXBSFu9OaaLW_k1KDgHVTNQGV8_0ZwNMFad-pPU,5837
|
|
@@ -29,9 +31,25 @@ datamule/mulebot/mulebot_server/static/scripts/suggestions.js,sha256=TCyz8OYuXeI
|
|
|
29
31
|
datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js,sha256=UtkUpLvELNI4Ibpb7VstgVA9Tk-8jbkxXhmXsgufFa4,4437
|
|
30
32
|
datamule/mulebot/mulebot_server/static/scripts/utils.js,sha256=oGPMtyT9dvuqHqrfZj33t4vLZiF8UJrMXB1hpPXRNu4,1255
|
|
31
33
|
datamule/mulebot/mulebot_server/templates/chat-minimalist.html,sha256=MsTbgpnLD0JCQiKKP3XeeNJRNsRqKsRa1j_XXW7nBKw,6975
|
|
34
|
+
datamule/parser/document_parsing/basic_10k_parser.py,sha256=-_q0X9K4KyLccF6j_zNp7FknGPBW1r4U3AT9bPjQUgA,3056
|
|
35
|
+
datamule/parser/document_parsing/basic_10q_parser.py,sha256=ccQc3pwBqevDb6-vBwEE5RTZwRcnrSxjRxZEk_zPO-s,2623
|
|
36
|
+
datamule/parser/document_parsing/basic_13d_parser.py,sha256=loJC97H_ccu_hWMhgNt5tvGZnN3--7tsqZxzBnWB_FY,1528
|
|
37
|
+
datamule/parser/document_parsing/basic_13g_parser.py,sha256=sWg83-QTAzUDNs45iWtpxnMxQgtC3zJlFj0R9ybZpNI,1631
|
|
38
|
+
datamule/parser/document_parsing/basic_8k_parser.py,sha256=inCSmlH_BkLK0Lkvt0kZ6EUJ0nijul_RkdXzccyOmRI,2466
|
|
39
|
+
datamule/parser/document_parsing/company_concepts_parser.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
40
|
+
datamule/parser/document_parsing/form_d_parser.py,sha256=dWlGeVZRzh0kfT3gVMC8eyqeQORdVV3r8KXUwEqAW3s,2036
|
|
41
|
+
datamule/parser/document_parsing/generalized_item_parser.py,sha256=67_DFb1BQbMmdHefEgoCPlEoiUT0zyxh3eBNJpjGXUk,2616
|
|
42
|
+
datamule/parser/document_parsing/generalized_xml_parser.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
43
|
+
datamule/parser/document_parsing/helper.py,sha256=QPhVxLxMSx6Qdi7sR4D4iPObGoTnVD3tXTCNWzNxStg,2533
|
|
44
|
+
datamule/parser/document_parsing/information_table_parser_13fhr.py,sha256=R4Up1oDx3xAlzHwXzVzUkdOSsk8YPuJBPS_3I_bNQSE,1767
|
|
45
|
+
datamule/parser/document_parsing/insider_trading_parser.py,sha256=OVQDeLcfaZtgmOWvWPDotftO6jxx-doFAqBYVqNgypo,7106
|
|
46
|
+
datamule/parser/document_parsing/mappings.py,sha256=VKdnT3C5yPTbB4ZBa4El4jnB-6_osomm2rbJx6Ac6HE,5286
|
|
47
|
+
datamule/parser/document_parsing/n_port_p_parser.py,sha256=GmmQFkCZt57WikUZ5DahtTYMhhk0VcfkhOJusM4Tkow,2224
|
|
48
|
+
datamule/parser/document_parsing/sec_parser.py,sha256=AS8H4h1sfUAdWP2gotULcjbylsYN_nHgTfkeVRyENPo,2716
|
|
49
|
+
datamule/parser/document_parsing/sgml_parser.py,sha256=tC1cL3cdVQPWbc9QtoRUYSo2wRuYNaglFaCmP57oEfA,3317
|
|
32
50
|
datamule/parser/sgml_parsing/sgml_parser_cy.c,sha256=UwXSuLGSBLKfO5bM7xTzjFLnGDV3-NNnCjgUCPAY1gk,796689
|
|
33
|
-
datamule/parser/sgml_parsing/sgml_parser_cy.cpython-310-darwin.so,sha256=
|
|
34
|
-
datamule-0.
|
|
35
|
-
datamule-0.
|
|
36
|
-
datamule-0.
|
|
37
|
-
datamule-0.
|
|
51
|
+
datamule/parser/sgml_parsing/sgml_parser_cy.cpython-310-darwin.so,sha256=r7PGnO6cbHnXO1BwDGGuOX6m9qyNtJcPeYpFm1M8t3o,362232
|
|
52
|
+
datamule-0.418.dist-info/METADATA,sha256=3s-xzrQ7TOuColC022WoYWoC2IOntbvhhhqlztGWoB8,1007
|
|
53
|
+
datamule-0.418.dist-info/WHEEL,sha256=Sooupui5EnBW-HNXjQU0OJoerPN0TT-a4xxlvmpUy2g,115
|
|
54
|
+
datamule-0.418.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
|
|
55
|
+
datamule-0.418.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|