datamule 0.416__cp310-cp310-win_amd64.whl → 0.418__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamule might be problematic. Click here for more details.

@@ -0,0 +1,94 @@
1
+ import os
2
+ import json
3
+ import uu
4
+ from io import BytesIO
5
+
6
+ class SimpleSGMLParser:
7
+ def _extract_tag_content(self, line: str) -> tuple[str, str] | None:
8
+ if not (line.startswith('<') and '>' in line):
9
+ return None
10
+
11
+ tag_end = line.index('>')
12
+ tag = line[1:tag_end]
13
+
14
+ if tag.startswith('/'):
15
+ return None
16
+
17
+ content = line[tag_end + 1:].strip()
18
+ return (tag, content)
19
+
20
+ def _write_document(self, content: str, document_info: dict, output_dir: str) -> None:
21
+ if not content:
22
+ return
23
+
24
+ output_path = os.path.join(output_dir, document_info.get('FILENAME', f"{document_info.get('SEQUENCE', 'unknown')}.txt"))
25
+
26
+ first_line = content.partition('\n')[0].strip()
27
+ if first_line.startswith('begin '):
28
+ with BytesIO(content.encode()) as input_file:
29
+ uu.decode(input_file, output_path,quiet=True)
30
+ else:
31
+ with open(output_path, 'w', encoding='utf-8') as f:
32
+ f.write(content)
33
+
34
+ def parse_file(self, filepath: str, output_dir: str) -> None:
35
+ os.makedirs(output_dir, exist_ok=True)
36
+
37
+ submission_data = {}
38
+ documents = []
39
+ current_document = {}
40
+ text_buffer = []
41
+
42
+ in_document = False
43
+ in_text = False
44
+ in_submission = True
45
+
46
+ with open(filepath, 'r', encoding='utf-8') as file:
47
+ for line in file:
48
+ stripped = line.strip()
49
+
50
+ if stripped == '<DOCUMENT>':
51
+ in_document = True
52
+ in_submission = False
53
+ current_document = {}
54
+
55
+ elif stripped == '</DOCUMENT>':
56
+ documents.append(current_document)
57
+ self._write_document(''.join(text_buffer), current_document, output_dir)
58
+ text_buffer = []
59
+ in_document = False
60
+
61
+ elif stripped == '<TEXT>':
62
+ in_text = True
63
+ text_buffer = []
64
+
65
+ elif stripped == '</TEXT>':
66
+ in_text = False
67
+
68
+ elif in_text:
69
+ if stripped not in ['<PDF>', '</PDF>']:
70
+ text_buffer.append(line)
71
+
72
+ else:
73
+ tag_content = self._extract_tag_content(stripped)
74
+ if tag_content:
75
+ key, value = tag_content
76
+ if in_submission:
77
+ submission_data[key] = value
78
+ elif in_document:
79
+ current_document[key] = value
80
+
81
+ metadata = {
82
+ 'submission': submission_data,
83
+ 'documents': documents
84
+ }
85
+
86
+ with open(os.path.join(output_dir, 'metadata.json'), 'w', encoding='utf-8') as f:
87
+ json.dump(metadata, f, indent=4)
88
+
89
def parse_sgml_submission(filepath: str, output_dir: str | None = None) -> None:
    """Parse an SGML submission file into a directory of documents.

    Args:
        filepath: Path of the SGML file to parse.
        output_dir: Directory that receives the output.  When omitted, the
            input path with its extension removed and ``_output`` appended
            is used.
    """
    if output_dir is None:
        stem, _extension = os.path.splitext(filepath)
        output_dir = stem + '_output'

    SimpleSGMLParser().parse_file(filepath, output_dir)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 0.416
3
+ Version: 0.418
4
4
  Summary: Making it easier to use SEC filings.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -16,11 +16,11 @@ Requires-Dist: selectolax
16
16
  Requires-Dist: pytz
17
17
  Requires-Dist: zstandard
18
18
  Provides-Extra: all
19
+ Requires-Dist: flask; extra == "all"
20
+ Requires-Dist: psutil; extra == "all"
19
21
  Requires-Dist: pandas; extra == "all"
20
22
  Requires-Dist: openai; extra == "all"
21
- Requires-Dist: psutil; extra == "all"
22
23
  Requires-Dist: google-generativeai; extra == "all"
23
- Requires-Dist: flask; extra == "all"
24
24
  Provides-Extra: dataset_builder
25
25
  Requires-Dist: pandas; extra == "dataset-builder"
26
26
  Requires-Dist: google-generativeai; extra == "dataset-builder"
@@ -1,4 +1,4 @@
1
- datamule/__init__.py,sha256=c5Ozl19hzvyMqG75Zl3XFHIBQihxfB47uoTm7EfOOTQ,2033
1
+ datamule/__init__.py,sha256=Li3iau_u87wQQhoPliSTTpGaf3OMf5jIvqtHFJmCvnw,2338
2
2
  datamule/document.py,sha256=6xEaI-32AQiBxX3gZcX4Qr49bgvcvLviFAwUGpTwtr0,5273
3
3
  datamule/helper.py,sha256=8HOjB3Y7svw_zjEY-AY5JKOJ-LrBiuQMPyok3MH6CCg,4716
4
4
  datamule/monitor.py,sha256=WVds1HGV_ojYgWmo0b4Dsiv9mzZ85HHnCucH-7XoUw8,9350
@@ -11,6 +11,8 @@ datamule/data/company_tickers.csv,sha256=ihU6aNFriN0lADloCO85Op04deFk3qVcLZ0EJhi
11
11
  datamule/data/sec-glossary.csv,sha256=TPjTBVM3kyFd8xHsmihykepvKbuLAAthOfEDjh_H-Kk,251937
12
12
  datamule/data/xbrl_descriptions.csv,sha256=Hg9BOo9zSjR7Khvx0pikILcbmDK_A404dmQtWuESK4s,2631548
13
13
  datamule/dataset_builder/dataset_builder.py,sha256=h1JDzLcMKxxMcXcD24EyqjUPp78iLWCovfNLQtpwZi4,10005
14
+ datamule/downloader/downloader.py,sha256=D95tUD-y7xmk8JatgkzZnyIouH6AMwUF78nlgOY-FM8,14872
15
+ datamule/downloader/premiumdownloader.py,sha256=jLr9jWDUrOQQXagbpbqc9rc2H_od6tDhyKnmNKABids,14537
14
16
  datamule/mulebot/__init__.py,sha256=GM5cTnijSSLO9GXFdsCuz5B1iwGUcxDbpoBQ6zw1Odo,30
15
17
  datamule/mulebot/helper.py,sha256=Hzzr2HReHpFe2GfpVU79EXvQFx3oL9UiwkJp--Sd1N4,1114
16
18
  datamule/mulebot/mulebot.py,sha256=wN0Tv5fvarXgk1LRCcgPhj8Xgd8uYFn-cMucxTxRsEo,5966
@@ -29,9 +31,25 @@ datamule/mulebot/mulebot_server/static/scripts/suggestions.js,sha256=vqFoHG0z0YI
29
31
  datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js,sha256=jelibkbM9YlrlU7_-kaVisMh8OjIKWZpJQ9_FiHV4OQ,4565
30
32
  datamule/mulebot/mulebot_server/static/scripts/utils.js,sha256=pOzsJ6bGxakN6G5qUvmkZd34u0GRHFxvH3pKpUmnhMc,1282
31
33
  datamule/mulebot/mulebot_server/templates/chat-minimalist.html,sha256=mm-LomSoNcJImsFwLpFWt3kIFnxUzIPmUZ1Dt5BFN0s,7065
34
+ datamule/parser/document_parsing/basic_10k_parser.py,sha256=z1ZP8L8ZOfcpo6S3jmS3aQvefOurqS4u6oy9bh1UBHA,3137
35
+ datamule/parser/document_parsing/basic_10q_parser.py,sha256=kZWGljvC0TYLh4eWl4JQyl4XYto8X6NN6YIW-_iffqA,2695
36
+ datamule/parser/document_parsing/basic_13d_parser.py,sha256=1nxBi9KFJuzw7CeEXpJFDzYVUd6rj4eCYiKHsGjK2b4,1585
37
+ datamule/parser/document_parsing/basic_13g_parser.py,sha256=H9_MuOgkYVTIGwhj9w_WIpfMDdgakE_qs0Y3fvaUj_E,1691
38
+ datamule/parser/document_parsing/basic_8k_parser.py,sha256=fzf8q9LOpBMHGWw-sfqUq3pyFZBlw47nLJBQWPhtGGg,2549
39
+ datamule/parser/document_parsing/company_concepts_parser.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
+ datamule/parser/document_parsing/form_d_parser.py,sha256=NTAfC8W3i2y7aIofXoLlAbY-4F6QVELYfIPIrVErjVY,2105
41
+ datamule/parser/document_parsing/generalized_item_parser.py,sha256=M2bmYivSXe0POyBtDlPMykyyCgG8n1egRpJuZtZTR_g,2694
42
+ datamule/parser/document_parsing/generalized_xml_parser.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
43
+ datamule/parser/document_parsing/helper.py,sha256=wEhqx70CQ0SyEfmOfSCcNsf2TNTtomZvTbmoOhwG3kk,2607
44
+ datamule/parser/document_parsing/information_table_parser_13fhr.py,sha256=vCIEqxOoBf1lnOKTJAifnPEXihtSPZcwRFW8yWhEZLc,1808
45
+ datamule/parser/document_parsing/insider_trading_parser.py,sha256=IrH5a3qikPjC1GFoYzapfYW_CArbK0FkbJLbWSfsYNQ,7264
46
+ datamule/parser/document_parsing/mappings.py,sha256=dq6EjaxxDHjH-sg62adRwJOf1v736QiLwXavOHs2vy8,5380
47
+ datamule/parser/document_parsing/n_port_p_parser.py,sha256=T6GliMm-TETPsFM-hDKt1BkMnSenvDeThj50RsIip50,2293
48
+ datamule/parser/document_parsing/sec_parser.py,sha256=YewOdOsi0P25teQuxS5DNEND9ZCyxE2ewK1DoP9mPto,2788
49
+ datamule/parser/document_parsing/sgml_parser.py,sha256=ASpe1SzgPj4qk0VOmmuMiEQeatjcwZzsuO3MvsYCHhc,3410
32
50
  datamule/parser/sgml_parsing/sgml_parser_cy.c,sha256=vxLnjpUgZ5LLvBvzYI_CZVxjpgRpulnzj3EFQG5eB8g,797203
33
- datamule/parser/sgml_parsing/sgml_parser_cy.cp310-win_amd64.pyd,sha256=BKQvdSfaD2SQ8QrdQLoK_pv9HTVw5rhZcsz2RF6gJhA,121856
34
- datamule-0.416.dist-info/METADATA,sha256=1n3QhNG_VX2_lQ1yNbJBVGdB3pi73eaLQb34RVTB0Kg,1039
35
- datamule-0.416.dist-info/WHEEL,sha256=NVXpD7b4Gxps0cd2ds5rr5TG8W4ApEwx_i5J99qMZ5E,102
36
- datamule-0.416.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
37
- datamule-0.416.dist-info/RECORD,,
51
+ datamule/parser/sgml_parsing/sgml_parser_cy.cp310-win_amd64.pyd,sha256=8JvbOWZgTRVUllMBRjghOu_OYQWGF56-QYAU-h2DtUM,121856
52
+ datamule-0.418.dist-info/METADATA,sha256=_JAZVOyjkRtg8m8A6i8DjspRv7mLrUtsHdSw02cxc4w,1039
53
+ datamule-0.418.dist-info/WHEEL,sha256=NVXpD7b4Gxps0cd2ds5rr5TG8W4ApEwx_i5J99qMZ5E,102
54
+ datamule-0.418.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
55
+ datamule-0.418.dist-info/RECORD,,