datamule 0.416__cp310-cp310-macosx_10_9_universal2.whl → 0.418__cp310-cp310-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamule might be problematic. Click here for more details.

@@ -0,0 +1,94 @@
1
+ import os
2
+ import json
3
+ import uu
4
+ from io import BytesIO
5
+
6
+ class SimpleSGMLParser:
7
+ def _extract_tag_content(self, line: str) -> tuple[str, str] | None:
8
+ if not (line.startswith('<') and '>' in line):
9
+ return None
10
+
11
+ tag_end = line.index('>')
12
+ tag = line[1:tag_end]
13
+
14
+ if tag.startswith('/'):
15
+ return None
16
+
17
+ content = line[tag_end + 1:].strip()
18
+ return (tag, content)
19
+
20
+ def _write_document(self, content: str, document_info: dict, output_dir: str) -> None:
21
+ if not content:
22
+ return
23
+
24
+ output_path = os.path.join(output_dir, document_info.get('FILENAME', f"{document_info.get('SEQUENCE', 'unknown')}.txt"))
25
+
26
+ first_line = content.partition('\n')[0].strip()
27
+ if first_line.startswith('begin '):
28
+ with BytesIO(content.encode()) as input_file:
29
+ uu.decode(input_file, output_path,quiet=True)
30
+ else:
31
+ with open(output_path, 'w', encoding='utf-8') as f:
32
+ f.write(content)
33
+
34
+ def parse_file(self, filepath: str, output_dir: str) -> None:
35
+ os.makedirs(output_dir, exist_ok=True)
36
+
37
+ submission_data = {}
38
+ documents = []
39
+ current_document = {}
40
+ text_buffer = []
41
+
42
+ in_document = False
43
+ in_text = False
44
+ in_submission = True
45
+
46
+ with open(filepath, 'r', encoding='utf-8') as file:
47
+ for line in file:
48
+ stripped = line.strip()
49
+
50
+ if stripped == '<DOCUMENT>':
51
+ in_document = True
52
+ in_submission = False
53
+ current_document = {}
54
+
55
+ elif stripped == '</DOCUMENT>':
56
+ documents.append(current_document)
57
+ self._write_document(''.join(text_buffer), current_document, output_dir)
58
+ text_buffer = []
59
+ in_document = False
60
+
61
+ elif stripped == '<TEXT>':
62
+ in_text = True
63
+ text_buffer = []
64
+
65
+ elif stripped == '</TEXT>':
66
+ in_text = False
67
+
68
+ elif in_text:
69
+ if stripped not in ['<PDF>', '</PDF>']:
70
+ text_buffer.append(line)
71
+
72
+ else:
73
+ tag_content = self._extract_tag_content(stripped)
74
+ if tag_content:
75
+ key, value = tag_content
76
+ if in_submission:
77
+ submission_data[key] = value
78
+ elif in_document:
79
+ current_document[key] = value
80
+
81
+ metadata = {
82
+ 'submission': submission_data,
83
+ 'documents': documents
84
+ }
85
+
86
+ with open(os.path.join(output_dir, 'metadata.json'), 'w', encoding='utf-8') as f:
87
+ json.dump(metadata, f, indent=4)
88
+
89
def parse_sgml_submission(filepath: str, output_dir: str | None = None) -> None:
    """Parse the SGML submission at *filepath* with ``SimpleSGMLParser``.

    Args:
        filepath: Path of the SGML file to parse.
        output_dir: Directory that receives the output. When ``None``, the
            input path with its extension removed and ``_output`` appended
            is used.
    """
    target_dir = output_dir
    if target_dir is None:
        base_path, _extension = os.path.splitext(filepath)
        target_dir = base_path + '_output'

    SimpleSGMLParser().parse_file(filepath, target_dir)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 0.416
3
+ Version: 0.418
4
4
  Summary: Making it easier to use SEC filings.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -16,11 +16,11 @@ Requires-Dist: selectolax
16
16
  Requires-Dist: pytz
17
17
  Requires-Dist: zstandard
18
18
  Provides-Extra: all
19
- Requires-Dist: pandas; extra == "all"
20
19
  Requires-Dist: openai; extra == "all"
21
- Requires-Dist: google-generativeai; extra == "all"
22
- Requires-Dist: psutil; extra == "all"
23
20
  Requires-Dist: flask; extra == "all"
21
+ Requires-Dist: psutil; extra == "all"
22
+ Requires-Dist: google-generativeai; extra == "all"
23
+ Requires-Dist: pandas; extra == "all"
24
24
  Provides-Extra: dataset_builder
25
25
  Requires-Dist: pandas; extra == "dataset-builder"
26
26
  Requires-Dist: google-generativeai; extra == "dataset-builder"
@@ -1,4 +1,4 @@
1
- datamule/__init__.py,sha256=8zx1hqAJeTxqmJ43zjJpJl8SRIRBx1zj8RNrsvYrQhE,1972
1
+ datamule/__init__.py,sha256=i3HnWFrqEdsK2OmvE7NUjcO05w1BuIYBV6J4cSFaN3s,2268
2
2
  datamule/document.py,sha256=Yn8UqUjKwYPE29MrMjreHK_HY9eTqOSjPyM5B1VBrHQ,5144
3
3
  datamule/helper.py,sha256=tr3AQWus9dHNZFKpLSglWjcb8zmm5qDXjOWACMhvMxQ,4594
4
4
  datamule/monitor.py,sha256=mRaM8v5NgcMF9DJ1s_YBzucjrbr-3yFwW422MVml-_Q,9114
@@ -11,6 +11,8 @@ datamule/data/company_tickers.csv,sha256=GW6lOP54RiGJCx-d9N5jEBy7tGVgU3zI-5xHJXr
11
11
  datamule/data/sec-glossary.csv,sha256=-cN7GjiadLw5C1sv4zSeCnfeZZDYeSgJl-0ydarMAo0,251209
12
12
  datamule/data/xbrl_descriptions.csv,sha256=SQ9wUURNqG424rnTiZtopsxV2q-PvU4NMj52LqgDsvg,2621524
13
13
  datamule/dataset_builder/dataset_builder.py,sha256=NCvNbDwlEkA_eAbqbsG--YlqPBDREFTVSM1GJquR0RE,9747
14
+ datamule/downloader/downloader.py,sha256=XJF0FfoCB43_22lhPakq9dI-oJ_SsXIvNx4PUVmcEOc,14509
15
+ datamule/downloader/premiumdownloader.py,sha256=JH4aZ-ZwARCIACKwgzSgHAuOkKPc_GnhiUHSSu22XO4,14206
14
16
  datamule/mulebot/__init__.py,sha256=YvZXV6xQ0iP-oGD8rloufjdwJL6D46P3NNr0CY9PQCA,29
15
17
  datamule/mulebot/helper.py,sha256=olztOwltfELZ-IERM2bRNLBavD04kfB6ueWTisJAleA,1080
16
18
  datamule/mulebot/mulebot.py,sha256=XbtgvXBSFu9OaaLW_k1KDgHVTNQGV8_0ZwNMFad-pPU,5837
@@ -29,9 +31,25 @@ datamule/mulebot/mulebot_server/static/scripts/suggestions.js,sha256=TCyz8OYuXeI
29
31
  datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js,sha256=UtkUpLvELNI4Ibpb7VstgVA9Tk-8jbkxXhmXsgufFa4,4437
30
32
  datamule/mulebot/mulebot_server/static/scripts/utils.js,sha256=oGPMtyT9dvuqHqrfZj33t4vLZiF8UJrMXB1hpPXRNu4,1255
31
33
  datamule/mulebot/mulebot_server/templates/chat-minimalist.html,sha256=MsTbgpnLD0JCQiKKP3XeeNJRNsRqKsRa1j_XXW7nBKw,6975
34
+ datamule/parser/document_parsing/basic_10k_parser.py,sha256=-_q0X9K4KyLccF6j_zNp7FknGPBW1r4U3AT9bPjQUgA,3056
35
+ datamule/parser/document_parsing/basic_10q_parser.py,sha256=ccQc3pwBqevDb6-vBwEE5RTZwRcnrSxjRxZEk_zPO-s,2623
36
+ datamule/parser/document_parsing/basic_13d_parser.py,sha256=loJC97H_ccu_hWMhgNt5tvGZnN3--7tsqZxzBnWB_FY,1528
37
+ datamule/parser/document_parsing/basic_13g_parser.py,sha256=sWg83-QTAzUDNs45iWtpxnMxQgtC3zJlFj0R9ybZpNI,1631
38
+ datamule/parser/document_parsing/basic_8k_parser.py,sha256=inCSmlH_BkLK0Lkvt0kZ6EUJ0nijul_RkdXzccyOmRI,2466
39
+ datamule/parser/document_parsing/company_concepts_parser.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
+ datamule/parser/document_parsing/form_d_parser.py,sha256=dWlGeVZRzh0kfT3gVMC8eyqeQORdVV3r8KXUwEqAW3s,2036
41
+ datamule/parser/document_parsing/generalized_item_parser.py,sha256=67_DFb1BQbMmdHefEgoCPlEoiUT0zyxh3eBNJpjGXUk,2616
42
+ datamule/parser/document_parsing/generalized_xml_parser.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
43
+ datamule/parser/document_parsing/helper.py,sha256=QPhVxLxMSx6Qdi7sR4D4iPObGoTnVD3tXTCNWzNxStg,2533
44
+ datamule/parser/document_parsing/information_table_parser_13fhr.py,sha256=R4Up1oDx3xAlzHwXzVzUkdOSsk8YPuJBPS_3I_bNQSE,1767
45
+ datamule/parser/document_parsing/insider_trading_parser.py,sha256=OVQDeLcfaZtgmOWvWPDotftO6jxx-doFAqBYVqNgypo,7106
46
+ datamule/parser/document_parsing/mappings.py,sha256=VKdnT3C5yPTbB4ZBa4El4jnB-6_osomm2rbJx6Ac6HE,5286
47
+ datamule/parser/document_parsing/n_port_p_parser.py,sha256=GmmQFkCZt57WikUZ5DahtTYMhhk0VcfkhOJusM4Tkow,2224
48
+ datamule/parser/document_parsing/sec_parser.py,sha256=AS8H4h1sfUAdWP2gotULcjbylsYN_nHgTfkeVRyENPo,2716
49
+ datamule/parser/document_parsing/sgml_parser.py,sha256=tC1cL3cdVQPWbc9QtoRUYSo2wRuYNaglFaCmP57oEfA,3317
32
50
  datamule/parser/sgml_parsing/sgml_parser_cy.c,sha256=UwXSuLGSBLKfO5bM7xTzjFLnGDV3-NNnCjgUCPAY1gk,796689
33
- datamule/parser/sgml_parsing/sgml_parser_cy.cpython-310-darwin.so,sha256=Zs-4Wm-pN-4ddHLBniegMkO2WbCsiJF09hIMcLZbSQU,362232
34
- datamule-0.416.dist-info/METADATA,sha256=BLgydCvASmb5OR6jlTOoTH06YFx3LH7ygfaZCK7UH2Y,1007
35
- datamule-0.416.dist-info/WHEEL,sha256=Sooupui5EnBW-HNXjQU0OJoerPN0TT-a4xxlvmpUy2g,115
36
- datamule-0.416.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
37
- datamule-0.416.dist-info/RECORD,,
51
+ datamule/parser/sgml_parsing/sgml_parser_cy.cpython-310-darwin.so,sha256=r7PGnO6cbHnXO1BwDGGuOX6m9qyNtJcPeYpFm1M8t3o,362232
52
+ datamule-0.418.dist-info/METADATA,sha256=3s-xzrQ7TOuColC022WoYWoC2IOntbvhhhqlztGWoB8,1007
53
+ datamule-0.418.dist-info/WHEEL,sha256=Sooupui5EnBW-HNXjQU0OJoerPN0TT-a4xxlvmpUy2g,115
54
+ datamule-0.418.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
55
+ datamule-0.418.dist-info/RECORD,,