datamule 0.416__cp39-cp39-win_amd64.whl → 0.418__cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamule might be problematic. Click here for more details.
- datamule/__init__.py +9 -0
- datamule/downloader/downloader.py +364 -0
- datamule/downloader/premiumdownloader.py +332 -0
- datamule/parser/document_parsing/basic_10k_parser.py +82 -0
- datamule/parser/document_parsing/basic_10q_parser.py +73 -0
- datamule/parser/document_parsing/basic_13d_parser.py +58 -0
- datamule/parser/document_parsing/basic_13g_parser.py +61 -0
- datamule/parser/document_parsing/basic_8k_parser.py +84 -0
- datamule/parser/document_parsing/company_concepts_parser.py +0 -0
- datamule/parser/document_parsing/form_d_parser.py +70 -0
- datamule/parser/document_parsing/generalized_item_parser.py +78 -0
- datamule/parser/document_parsing/generalized_xml_parser.py +0 -0
- datamule/parser/document_parsing/helper.py +75 -0
- datamule/parser/document_parsing/information_table_parser_13fhr.py +41 -0
- datamule/parser/document_parsing/insider_trading_parser.py +158 -0
- datamule/parser/document_parsing/mappings.py +95 -0
- datamule/parser/document_parsing/n_port_p_parser.py +70 -0
- datamule/parser/document_parsing/sec_parser.py +73 -0
- datamule/parser/document_parsing/sgml_parser.py +94 -0
- datamule/parser/sgml_parsing/sgml_parser_cy.cp39-win_amd64.pyd +0 -0
- {datamule-0.416.dist-info → datamule-0.418.dist-info}/METADATA +2 -2
- {datamule-0.416.dist-info → datamule-0.418.dist-info}/RECORD +24 -6
- {datamule-0.416.dist-info → datamule-0.418.dist-info}/WHEEL +0 -0
- {datamule-0.416.dist-info → datamule-0.418.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import uu
|
|
4
|
+
from io import BytesIO
|
|
5
|
+
|
|
6
|
+
class SimpleSGMLParser:
    """Split an SGML submission file into its documents plus a metadata file.

    The input is read line by line. ``<TAG>value`` lines seen before the first
    ``<DOCUMENT>`` are collected as submission-level metadata; those seen
    inside a ``<DOCUMENT>`` but outside ``<TEXT>`` become that document's
    metadata. The body between ``<TEXT>`` and ``</TEXT>`` is written to its
    own file, and all metadata is dumped to ``metadata.json``.
    """

    # The return annotation is quoted on purpose: ``X | None`` evaluated at
    # definition time raises TypeError on Python < 3.10.
    def _extract_tag_content(self, line: str) -> "tuple[str, str] | None":
        """Return ``(tag, content)`` for an opening ``<TAG>content`` line.

        Args:
            line: A line with surrounding whitespace already removed.

        Returns:
            The tag name and the stripped text after ``>``, or ``None`` when
            the line does not start with ``<``, has no ``>``, or is a closing
            tag (``</TAG>``).
        """
        if not (line.startswith('<') and '>' in line):
            return None

        tag_end = line.index('>')
        tag = line[1:tag_end]

        if tag.startswith('/'):
            return None

        return (tag, line[tag_end + 1:].strip())

    @staticmethod
    def _safe_filename(document_info: dict) -> str:
        """Pick an output file name that cannot leave the output directory.

        ``FILENAME`` and ``SEQUENCE`` come from the parsed file itself, so
        only the final path component is kept. Falls back to
        ``<SEQUENCE>.txt`` and finally to ``unknown.txt``.
        """
        fallback = f"{document_info.get('SEQUENCE', 'unknown')}.txt"
        for candidate in (document_info.get('FILENAME', fallback), fallback):
            # Normalise backslashes so Windows-style paths are also reduced
            # to their last component on POSIX systems.
            name = os.path.basename(candidate.replace('\\', '/'))
            if name not in ('', '.', '..'):
                return name
        return 'unknown.txt'

    @staticmethod
    def _is_uu_header(first_line: str) -> bool:
        """Return True if ``first_line`` looks like ``begin <octal mode> ...``."""
        parts = first_line.split(' ', 2)
        if len(parts) < 2 or parts[0] != 'begin':
            return False
        mode = parts[1]
        return bool(mode) and all(ch in '01234567' for ch in mode)

    @staticmethod
    def _decode_uuencoded(content: str) -> bytes:
        """Decode a uuencoded payload whose first line is the ``begin`` header.

        Raises:
            ValueError: If a data line cannot be decoded (``binascii.Error``
                is a ``ValueError`` subclass) or no ``end`` line is found.
        """
        # Local import: ``binascii`` replaces the ``uu`` module, which was
        # removed from the standard library in Python 3.13.
        import binascii

        stream = BytesIO(content.encode())
        stream.readline()  # skip the already validated "begin" header
        decoded = bytearray()
        while True:
            raw = stream.readline()
            if not raw:
                raise ValueError('Truncated input file')
            if raw.strip(b' \t\r\n\f') == b'end':
                return bytes(decoded)
            try:
                decoded += binascii.a2b_uu(raw)
            except binascii.Error:
                # Some encoders emit trailing garbage; retry with only the
                # number of characters announced by the length byte.
                nbytes = (((raw[0] - 32) & 63) * 4 + 5) // 3
                decoded += binascii.a2b_uu(raw[:nbytes])

    def _write_document(self, content: str, document_info: dict, output_dir: str) -> None:
        """Write one document body into ``output_dir``.

        Args:
            content: Text collected between ``<TEXT>`` and ``</TEXT>``.
                Nothing is written when it is empty.
            document_info: Tag/value pairs of the document; ``FILENAME`` (or
                ``SEQUENCE`` as a fallback) determines the output file name.
            output_dir: Existing directory that receives the file.

        A body that starts with a valid uuencode header is decoded and written
        as binary. If the payload turns out not to be decodable, the raw text
        is written instead so that no content is lost.
        """
        if not content:
            return

        output_path = os.path.join(output_dir, self._safe_filename(document_info))

        first_line = content.partition('\n')[0].strip()
        if self._is_uu_header(first_line):
            try:
                binary = self._decode_uuencoded(content)
            except ValueError:
                # Prose can legitimately start with "begin NNN ..."; keep it
                # verbatim rather than aborting the whole parse.
                binary = None
            if binary is not None:
                with open(output_path, 'wb') as f:
                    f.write(binary)
                return

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(content)

    def parse_file(self, filepath: str, output_dir: str) -> None:
        """Parse ``filepath`` and write its documents and ``metadata.json``.

        Args:
            filepath: Path of the SGML file; it is read as UTF-8 text.
            output_dir: Directory for the results; created if missing.

        ``metadata.json`` has two keys: ``submission`` (tags seen before the
        first document) and ``documents`` (one dict of tags per document, in
        file order). A tag repeated within one scope keeps its last value.
        """
        os.makedirs(output_dir, exist_ok=True)

        submission_data = {}
        documents = []
        current_document = {}
        text_buffer = []

        in_document = False
        in_text = False
        in_submission = True

        with open(filepath, 'r', encoding='utf-8') as file:
            for line in file:
                stripped = line.strip()

                if stripped == '<DOCUMENT>':
                    in_document = True
                    in_submission = False
                    # Reset in case the previous document lacked </TEXT>.
                    in_text = False
                    current_document = {}

                elif stripped == '</DOCUMENT>':
                    documents.append(current_document)
                    self._write_document(''.join(text_buffer), current_document, output_dir)
                    text_buffer = []
                    in_document = False
                    in_text = False

                elif stripped == '<TEXT>':
                    in_text = True
                    text_buffer = []

                elif stripped == '</TEXT>':
                    in_text = False

                elif in_text:
                    # <PDF> wrapper lines are dropped from the body so that a
                    # uuencoded payload starts directly with its header.
                    if stripped not in ['<PDF>', '</PDF>']:
                        text_buffer.append(line)

                else:
                    tag_content = self._extract_tag_content(stripped)
                    if tag_content:
                        key, value = tag_content
                        if in_submission:
                            submission_data[key] = value
                        elif in_document:
                            current_document[key] = value

        metadata = {
            'submission': submission_data,
            'documents': documents
        }

        with open(os.path.join(output_dir, 'metadata.json'), 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=4)
|
|
88
|
+
|
|
89
|
+
def parse_sgml_submission(filepath: str, output_dir: "str | None" = None) -> None:
    """Parse the SGML file at ``filepath`` with ``SimpleSGMLParser``.

    Args:
        filepath: Path of the SGML file to parse.
        output_dir: Directory passed on to ``SimpleSGMLParser.parse_file``.
            When ``None``, it defaults to ``filepath`` with its extension
            removed and ``_output`` appended.
    """
    # The annotation above is quoted on purpose: ``str | None`` evaluated at
    # definition time raises TypeError on Python < 3.10.
    if output_dir is None:
        output_dir = os.path.splitext(filepath)[0] + '_output'

    parser = SimpleSGMLParser()
    parser.parse_file(filepath, output_dir)
|
|
Binary file
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datamule
|
|
3
|
-
Version: 0.416
|
|
3
|
+
Version: 0.418
|
|
4
4
|
Summary: Making it easier to use SEC filings.
|
|
5
5
|
Home-page: https://github.com/john-friedman/datamule-python
|
|
6
6
|
Author: John Friedman
|
|
@@ -18,9 +18,9 @@ Requires-Dist: selectolax
|
|
|
18
18
|
Requires-Dist: pytz
|
|
19
19
|
Requires-Dist: zstandard
|
|
20
20
|
Provides-Extra: all
|
|
21
|
+
Requires-Dist: openai; extra == "all"
|
|
21
22
|
Requires-Dist: flask; extra == "all"
|
|
22
23
|
Requires-Dist: psutil; extra == "all"
|
|
23
|
-
Requires-Dist: openai; extra == "all"
|
|
24
24
|
Requires-Dist: google-generativeai; extra == "all"
|
|
25
25
|
Requires-Dist: pandas; extra == "all"
|
|
26
26
|
Provides-Extra: dataset_builder
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
datamule/__init__.py,sha256=
|
|
1
|
+
datamule/__init__.py,sha256=Li3iau_u87wQQhoPliSTTpGaf3OMf5jIvqtHFJmCvnw,2338
|
|
2
2
|
datamule/document.py,sha256=6xEaI-32AQiBxX3gZcX4Qr49bgvcvLviFAwUGpTwtr0,5273
|
|
3
3
|
datamule/helper.py,sha256=8HOjB3Y7svw_zjEY-AY5JKOJ-LrBiuQMPyok3MH6CCg,4716
|
|
4
4
|
datamule/monitor.py,sha256=WVds1HGV_ojYgWmo0b4Dsiv9mzZ85HHnCucH-7XoUw8,9350
|
|
@@ -11,6 +11,8 @@ datamule/data/company_tickers.csv,sha256=ihU6aNFriN0lADloCO85Op04deFk3qVcLZ0EJhi
|
|
|
11
11
|
datamule/data/sec-glossary.csv,sha256=TPjTBVM3kyFd8xHsmihykepvKbuLAAthOfEDjh_H-Kk,251937
|
|
12
12
|
datamule/data/xbrl_descriptions.csv,sha256=Hg9BOo9zSjR7Khvx0pikILcbmDK_A404dmQtWuESK4s,2631548
|
|
13
13
|
datamule/dataset_builder/dataset_builder.py,sha256=h1JDzLcMKxxMcXcD24EyqjUPp78iLWCovfNLQtpwZi4,10005
|
|
14
|
+
datamule/downloader/downloader.py,sha256=D95tUD-y7xmk8JatgkzZnyIouH6AMwUF78nlgOY-FM8,14872
|
|
15
|
+
datamule/downloader/premiumdownloader.py,sha256=jLr9jWDUrOQQXagbpbqc9rc2H_od6tDhyKnmNKABids,14537
|
|
14
16
|
datamule/mulebot/__init__.py,sha256=GM5cTnijSSLO9GXFdsCuz5B1iwGUcxDbpoBQ6zw1Odo,30
|
|
15
17
|
datamule/mulebot/helper.py,sha256=Hzzr2HReHpFe2GfpVU79EXvQFx3oL9UiwkJp--Sd1N4,1114
|
|
16
18
|
datamule/mulebot/mulebot.py,sha256=wN0Tv5fvarXgk1LRCcgPhj8Xgd8uYFn-cMucxTxRsEo,5966
|
|
@@ -29,9 +31,25 @@ datamule/mulebot/mulebot_server/static/scripts/suggestions.js,sha256=vqFoHG0z0YI
|
|
|
29
31
|
datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js,sha256=jelibkbM9YlrlU7_-kaVisMh8OjIKWZpJQ9_FiHV4OQ,4565
|
|
30
32
|
datamule/mulebot/mulebot_server/static/scripts/utils.js,sha256=pOzsJ6bGxakN6G5qUvmkZd34u0GRHFxvH3pKpUmnhMc,1282
|
|
31
33
|
datamule/mulebot/mulebot_server/templates/chat-minimalist.html,sha256=mm-LomSoNcJImsFwLpFWt3kIFnxUzIPmUZ1Dt5BFN0s,7065
|
|
34
|
+
datamule/parser/document_parsing/basic_10k_parser.py,sha256=z1ZP8L8ZOfcpo6S3jmS3aQvefOurqS4u6oy9bh1UBHA,3137
|
|
35
|
+
datamule/parser/document_parsing/basic_10q_parser.py,sha256=kZWGljvC0TYLh4eWl4JQyl4XYto8X6NN6YIW-_iffqA,2695
|
|
36
|
+
datamule/parser/document_parsing/basic_13d_parser.py,sha256=1nxBi9KFJuzw7CeEXpJFDzYVUd6rj4eCYiKHsGjK2b4,1585
|
|
37
|
+
datamule/parser/document_parsing/basic_13g_parser.py,sha256=H9_MuOgkYVTIGwhj9w_WIpfMDdgakE_qs0Y3fvaUj_E,1691
|
|
38
|
+
datamule/parser/document_parsing/basic_8k_parser.py,sha256=fzf8q9LOpBMHGWw-sfqUq3pyFZBlw47nLJBQWPhtGGg,2549
|
|
39
|
+
datamule/parser/document_parsing/company_concepts_parser.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
40
|
+
datamule/parser/document_parsing/form_d_parser.py,sha256=NTAfC8W3i2y7aIofXoLlAbY-4F6QVELYfIPIrVErjVY,2105
|
|
41
|
+
datamule/parser/document_parsing/generalized_item_parser.py,sha256=M2bmYivSXe0POyBtDlPMykyyCgG8n1egRpJuZtZTR_g,2694
|
|
42
|
+
datamule/parser/document_parsing/generalized_xml_parser.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
43
|
+
datamule/parser/document_parsing/helper.py,sha256=wEhqx70CQ0SyEfmOfSCcNsf2TNTtomZvTbmoOhwG3kk,2607
|
|
44
|
+
datamule/parser/document_parsing/information_table_parser_13fhr.py,sha256=vCIEqxOoBf1lnOKTJAifnPEXihtSPZcwRFW8yWhEZLc,1808
|
|
45
|
+
datamule/parser/document_parsing/insider_trading_parser.py,sha256=IrH5a3qikPjC1GFoYzapfYW_CArbK0FkbJLbWSfsYNQ,7264
|
|
46
|
+
datamule/parser/document_parsing/mappings.py,sha256=dq6EjaxxDHjH-sg62adRwJOf1v736QiLwXavOHs2vy8,5380
|
|
47
|
+
datamule/parser/document_parsing/n_port_p_parser.py,sha256=T6GliMm-TETPsFM-hDKt1BkMnSenvDeThj50RsIip50,2293
|
|
48
|
+
datamule/parser/document_parsing/sec_parser.py,sha256=YewOdOsi0P25teQuxS5DNEND9ZCyxE2ewK1DoP9mPto,2788
|
|
49
|
+
datamule/parser/document_parsing/sgml_parser.py,sha256=ASpe1SzgPj4qk0VOmmuMiEQeatjcwZzsuO3MvsYCHhc,3410
|
|
32
50
|
datamule/parser/sgml_parsing/sgml_parser_cy.c,sha256=vxLnjpUgZ5LLvBvzYI_CZVxjpgRpulnzj3EFQG5eB8g,797203
|
|
33
|
-
datamule/parser/sgml_parsing/sgml_parser_cy.cp39-win_amd64.pyd,sha256=
|
|
34
|
-
datamule-0.
|
|
35
|
-
datamule-0.
|
|
36
|
-
datamule-0.
|
|
37
|
-
datamule-0.
|
|
51
|
+
datamule/parser/sgml_parsing/sgml_parser_cy.cp39-win_amd64.pyd,sha256=Ts6VLobOSehNoSITBlRx0iisf5uGWuzxyJ8ljw54Fvk,142848
|
|
52
|
+
datamule-0.418.dist-info/METADATA,sha256=H3rV4RWQImJgtSxpaW0-dGnnR7-mbl-_5iUKNQIv49k,1087
|
|
53
|
+
datamule-0.418.dist-info/WHEEL,sha256=yA7mxgqX2UV73NtJdMh2AAmdb628loM81912H3s5r00,100
|
|
54
|
+
datamule-0.418.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
|
|
55
|
+
datamule-0.418.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|