datamule-0.416-cp311-cp311-win_amd64.whl → datamule-0.417-cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datamule might be problematic.
- datamule/downloader/downloader.py +364 -0
- datamule/downloader/premiumdownloader.py +332 -0
- datamule/parser/document_parsing/basic_10k_parser.py +82 -0
- datamule/parser/document_parsing/basic_10q_parser.py +73 -0
- datamule/parser/document_parsing/basic_13d_parser.py +58 -0
- datamule/parser/document_parsing/basic_13g_parser.py +61 -0
- datamule/parser/document_parsing/basic_8k_parser.py +84 -0
- datamule/parser/document_parsing/company_concepts_parser.py +0 -0
- datamule/parser/document_parsing/form_d_parser.py +70 -0
- datamule/parser/document_parsing/generalized_item_parser.py +78 -0
- datamule/parser/document_parsing/generalized_xml_parser.py +0 -0
- datamule/parser/document_parsing/helper.py +75 -0
- datamule/parser/document_parsing/information_table_parser_13fhr.py +41 -0
- datamule/parser/document_parsing/insider_trading_parser.py +158 -0
- datamule/parser/document_parsing/mappings.py +95 -0
- datamule/parser/document_parsing/n_port_p_parser.py +70 -0
- datamule/parser/document_parsing/sec_parser.py +73 -0
- datamule/parser/document_parsing/sgml_parser.py +94 -0
- datamule/parser/sgml_parsing/sgml_parser_cy.cp311-win_amd64.pyd +0 -0
- {datamule-0.416.dist-info → datamule-0.417.dist-info}/METADATA +3 -3
- {datamule-0.416.dist-info → datamule-0.417.dist-info}/RECORD +23 -5
- {datamule-0.416.dist-info → datamule-0.417.dist-info}/WHEEL +0 -0
- {datamule-0.416.dist-info → datamule-0.417.dist-info}/top_level.txt +0 -0

@@ -0,0 +1,82 @@ datamule/parser/document_parsing/basic_10k_parser.py
from pathlib import Path
import re
from .helper import load_file_content, clean_title

PART_PATTERN = re.compile(r'\n\s*part[.:)?\s]+([IVX]+|\d+)', re.I)
ITEM_PATTERN = re.compile(r'\n\s*item[.:)?\s]+(\d+[A-Z]?)', re.I)
IS_10K_PATTERN = re.compile(r'item[.:)?\s]+14', re.I)
TOC_END_PATTERN = re.compile(r'(?:item[.:)?\s]+14).*?(?=\n\s*item[.:)?\s]+1\b)', re.I | re.DOTALL)

ROMAN_TO_NUM = {'I': '1', 'II': '2', 'III': '3', 'IV': '4'}

ITEM_TO_PART = {
    '1': 'I', '1A': 'I', '1B': 'I', '1C': 'I', '2': 'I', '3': 'I', '4': 'I',
    '5': 'II', '6': 'II', '7': 'II', '7A': 'II', '8': 'II', '9': 'II', '9A': 'II', '9B': 'II', '9C': 'II',
    '10': 'III', '11': 'III', '12': 'III', '13': 'III', '14': 'III',
    '15': 'IV', '16': 'IV', '16A': 'IV'
}

def find_content_start(content):
    toc_match = TOC_END_PATTERN.search(content)
    if toc_match:
        item_1_pattern = re.compile(r'\n\s*item\s*1\b', re.I)
        item_1_match = item_1_pattern.search(content, toc_match.end())
        if item_1_match:
            return item_1_match.start()
    return 0

def find_anchors(content):
    start_pos = find_content_start(content)
    content = '\n' + content[start_pos:]

    anchors = []
    for part_match in PART_PATTERN.finditer(content):
        anchors.append(('part', part_match.group(1), part_match.start() + start_pos, part_match.group()))

    for item_match in ITEM_PATTERN.finditer(content):
        anchors.append(('item', item_match.group(1), item_match.start() + start_pos, item_match.group()))

    return sorted(anchors, key=lambda x: x[2])

def extract_sections(content, anchors, filename):
    if not anchors:
        return {}

    result = {
        "metadata": {"document_name": Path(filename).stem},
        "document": {
            "part1": {}, "part2": {}, "part3": {}, "part4": {}
        }
    }

    last_item = None
    current_text = None

    for i, current in enumerate(anchors):
        if current[0] == 'item':
            next_pos = anchors[i+1][2] if i < len(anchors)-1 else len(content)
            text = content[current[2]:next_pos].strip()

            if current[1] == last_item:
                current_text += "\n\n" + text
            else:
                if last_item and last_item in ITEM_TO_PART:
                    part_num = ROMAN_TO_NUM[ITEM_TO_PART[last_item]]
                    result["document"][f"part{part_num}"][f"item{last_item.lower()}"] = current_text
                current_text = text
                last_item = current[1]

    if last_item and last_item in ITEM_TO_PART:
        part_num = ROMAN_TO_NUM[ITEM_TO_PART[last_item]]
        result["document"][f"part{part_num}"][f"item{last_item.lower()}"] = current_text

    # Only keep non-empty parts
    result["document"] = {k:v for k,v in result["document"].items() if v}
    return result

def parse_10k(filename):
    content = load_file_content(filename)
    if not IS_10K_PATTERN.search(content):
        return {}
    anchors = find_anchors(content)
    return extract_sections(content, anchors, filename)
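
A minimal usage sketch (not part of the package diff): assuming the wheel is installed and the subpackage is importable at the path shown in the file list, and given a hypothetical locally saved 10-K, parse_10k returns a nested dict keyed by part and item.

from pathlib import Path

from datamule.parser.document_parsing.basic_10k_parser import parse_10k  # assumed import path

filing = Path("filings/example_10k.htm")  # hypothetical local filing
parsed = parse_10k(filing)

# Expected shape, per extract_sections above:
# {"metadata": {"document_name": "example_10k"},
#  "document": {"part1": {"item1": "...", "item1a": "..."},
#               "part2": {"item7": "...", ...}}}
for part, items in parsed.get("document", {}).items():
    print(part, list(items))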

@@ -0,0 +1,73 @@ datamule/parser/document_parsing/basic_10q_parser.py
from pathlib import Path
from .helper import load_file_content, clean_title
import re

PART_II_PATTERN = re.compile(r'\n\s*part\s+II\.?(?:[:\s\.]|$)', re.I)
ITEM_PATTERN = re.compile(r'\n\s*item\s+(\d+[A-Z]?)\.?(?:[:\s\.]|$)', re.I)
TOC_END_PATTERN = re.compile(r'(?:item\s*6\.?).*?(?=\n\s*item\s*1\.?\b)', re.I | re.DOTALL)

def find_content_start(content):
    toc_match = TOC_END_PATTERN.search(content)
    if toc_match:
        item_1_pattern = re.compile(r'\n\s*item\s*1\b', re.I)
        item_1_match = item_1_pattern.search(content, toc_match.end())
        if item_1_match:
            return item_1_match.start()
    return 0

def find_anchors(content):
    start_pos = find_content_start(content)
    content = '\n' + content[start_pos:]

    part_ii_match = PART_II_PATTERN.search(content)
    part_ii_pos = part_ii_match.start() + start_pos if part_ii_match else None

    anchors = []
    for item_match in ITEM_PATTERN.finditer(content):
        anchors.append(('item', item_match.group(1), item_match.start() + start_pos, item_match.group()))

    return sorted(anchors, key=lambda x: x[2]), part_ii_pos

def extract_sections(content, anchors_and_part2, filename):
    anchors, part2_pos = anchors_and_part2
    if not anchors:
        return {}

    result = {
        "metadata": {"document_name": Path(filename).stem},
        "document": {
            "part1": {},
            "part2": {}
        }
    }

    last_item = None
    current_text = None
    last_pos = None

    for i, current in enumerate(anchors):
        next_pos = anchors[i+1][2] if i < len(anchors)-1 else len(content)

        if current[1] == last_item:
            current_text += "\n\n" + content[current[2]:next_pos].strip()
        else:
            if last_item is not None:
                part_key = "part2" if (part2_pos and last_pos >= part2_pos) else "part1"
                result["document"][part_key][f"item{last_item.lower()}"] = current_text

            current_text = content[current[2]:next_pos].strip()
            last_item = current[1]
            last_pos = current[2]

    if last_item is not None:
        part_key = "part2" if (part2_pos and last_pos >= part2_pos) else "part1"
        result["document"][part_key][f"item{last_item.lower()}"] = current_text

    # Clean empty parts
    result["document"] = {k:v for k,v in result["document"].items() if v}
    return result

def parse_10q(filename):
    content = load_file_content(filename)
    anchors_and_part2 = find_anchors(content)
    return extract_sections(content, anchors_and_part2, filename)
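
To illustrate how items are split between Part I and Part II, here is a small sketch that drives find_anchors and extract_sections directly on a synthetic string (hypothetical content, not a real filing); extract_sections buckets each item by whether its anchor sits before or after the PART II heading.

from datamule.parser.document_parsing.basic_10q_parser import find_anchors, extract_sections  # assumed import path

# Tiny synthetic 10-Q body; real filings are loaded with load_file_content.
content = (
    "\nPART I\n"
    "Item 1. Financial Statements\nBalance sheet text...\n"
    "Item 2. Management's Discussion\nMD&A text...\n"
    "\nPART II\n"
    "Item 1. Legal Proceedings\nLitigation text...\n"
    "Item 6. Exhibits\nExhibit index...\n"
)

anchors_and_part2 = find_anchors(content)
result = extract_sections(content, anchors_and_part2, "synthetic_10q.txt")
print(sorted(result["document"]))           # ['part1', 'part2']
print(sorted(result["document"]["part2"]))  # ['item1', 'item6']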

@@ -0,0 +1,58 @@ datamule/parser/document_parsing/basic_13d_parser.py
import re
from pathlib import Path
from .helper import load_file_content, clean_title

ITEM_PATTERN = re.compile(
    r"(?:^[ \t]*)"
    r"(?:"
    r"(?:Item|ITEM)\s*"
    r"(?:"
    r"1|"
    r"2|"
    r"3|"
    r"4|"
    r"5|"
    r"6|"
    r"7|"
    r"8|"
    r"9"
    r")"
    r"|"
    r"SIGNATURES?"
    r")",
    re.IGNORECASE | re.MULTILINE
)

def parse_13d(filename: Path) -> dict:
    text = load_file_content(filename)
    matches = [(clean_title(m.group().strip()), m.start()) for m in ITEM_PATTERN.finditer(text)]

    result = {
        "metadata": {"document_name": Path(filename).stem},
        "document": {}
    }

    if not matches:
        return result

    for i, (current_match, start_pos) in enumerate(matches[:-1]):
        section_text = WHITESPACE_PATTERN.sub(' ', text[start_pos:matches[i + 1][1]]).strip()
        if section_text:
            if "signature" in current_match.lower():
                key = "signatures"
            else:
                key = f"item{current_match.lower().replace('item', '').strip()}"
            result["document"][key] = section_text

    last_match, last_pos = matches[-1]
    section_text = WHITESPACE_PATTERN.sub(' ', text[last_pos:len(text)]).strip()
    if section_text:
        if "signature" in last_match.lower():
            key = "signatures"
        else:
            key = f"item{last_match.lower().replace('item', '').strip()}"
        result["document"][key] = section_text

    return result

WHITESPACE_PATTERN = re.compile(r'\s+')
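
A short sketch of the 13D parser's output keys (hypothetical path): matched headings are normalized by clean_title, so "Item 4." becomes the key item4 and any SIGNATURE heading collapses to signatures.

from pathlib import Path

from datamule.parser.document_parsing.basic_13d_parser import parse_13d  # assumed import path

parsed = parse_13d(Path("filings/example_sc13d.txt"))  # hypothetical path
# Headings such as "Item 4." become "item4"; a SIGNATURE(S) block becomes "signatures".
print(sorted(parsed["document"]))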

@@ -0,0 +1,61 @@ datamule/parser/document_parsing/basic_13g_parser.py
import re
from pathlib import Path
from .helper import load_file_content, clean_title

ITEM_PATTERN_13G = re.compile(
    r"(?:^[ \t]*)"
    r"(?:"
    r"(?:Item|ITEM)\s*"
    r"(?:"
    r"10|"  # Move 10 to the start so it's matched before 1
    r"11|"  # Similarly with 11 and 12
    r"12|"
    r"1|"
    r"2|"
    r"3|"
    r"4|"
    r"5|"
    r"6|"
    r"7|"
    r"8|"
    r"9"
    r")"
    r"|"
    r"SIGNATURES?"
    r")",
    re.IGNORECASE | re.MULTILINE
)

def parse_13g(filename: Path) -> dict:
    text = load_file_content(filename)
    matches = [(clean_title(m.group().strip()), m.start()) for m in ITEM_PATTERN_13G.finditer(text)]

    result = {
        "metadata": {"document_name": Path(filename).stem},
        "document": {}
    }

    if not matches:
        return result

    for i, (current_match, start_pos) in enumerate(matches[:-1]):
        section_text = WHITESPACE_PATTERN.sub(' ', text[start_pos:matches[i + 1][1]]).strip()
        if section_text:
            if "signature" in current_match.lower():
                key = "signatures"
            else:
                key = f"item{current_match.lower().replace('item', '').strip()}"
            result["document"][key] = section_text

    last_match, last_pos = matches[-1]
    section_text = WHITESPACE_PATTERN.sub(' ', text[last_pos:len(text)]).strip()
    if section_text:
        if "signature" in last_match.lower():
            key = "signatures"
        else:
            key = f"item{last_match.lower().replace('item', '').strip()}"
        result["document"][key] = section_text

    return result

WHITESPACE_PATTERN = re.compile(r'\s+')

@@ -0,0 +1,84 @@ datamule/parser/document_parsing/basic_8k_parser.py
import re
from pathlib import Path
from .helper import load_file_content, clean_title

ITEM_PATTERN = re.compile(
    r"(?:^[ \t]*)"
    r"(?:"
    r"(?:Item|ITEM)\s*"
    r"(?:"
    r"1\.0[1-4]|"
    r"2\.0[1-6]|"
    r"3\.0[1-3]|"
    r"4\.0[1-2]|"
    r"5\.0[1-8]|"
    r"6\.0[1-5]|"
    r"7\.01|"
    r"8\.01|"
    r"9\.01"
    r")"
    r"|"
    r"SIGNATURES?"
    r")",
    re.IGNORECASE | re.MULTILINE
)

WHITESPACE_PATTERN = re.compile(r'\s+')

def parse_section(text: str, start: int, end: int) -> str:
    return WHITESPACE_PATTERN.sub(' ', text[start:end].strip())

def validate_section_sequence(matches: list) -> None:
    current_base = None

    for match, _ in matches:
        base_section = re.match(r'(?:Item|ITEM)\s*(?:\d+\.\d+|\bSIGNATURES?\b)', match)
        if base_section:
            base_section = base_section.group().upper()

            if current_base is None:
                current_base = base_section
            elif base_section != current_base:
                current_base = base_section
            else:
                raise DuplicateSectionError(f"Section {base_section} appears multiple times before a different section")

def parse_8k(filename: Path) -> dict:
    text = load_file_content(filename)
    matches = [(clean_title(m.group().strip()), m.start()) for m in ITEM_PATTERN.finditer(text)]

    result = {
        "metadata": {"document_name": Path(filename).stem},
        "document": {}
    }

    if not matches:
        return result

    validate_section_sequence(matches)

    # Process all sections except last
    for i, (current_match, start_pos) in enumerate(matches[:-1]):
        section_text = parse_section(text, start_pos, matches[i + 1][1])
        if section_text:
            if "signature" in current_match.lower():
                key = "signatures"
            else:
                key = f"item{current_match.lower().replace('item', '').strip()}"
            result["document"][key] = section_text

    # Process last section
    last_match, last_pos = matches[-1]
    section_text = parse_section(text, last_pos, len(text))
    if section_text:
        if "signature" in last_match.lower():
            key = "signatures"
        else:
            key = f"item{last_match.lower().replace('item', '').strip()}"
        result["document"][key] = section_text

    return result

class DuplicateSectionError(Exception):
    """Raised when a section appears multiple times before a different section."""
    pass
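
Since validate_section_sequence raises DuplicateSectionError when the same item heading repeats back to back, callers may want to guard the call; a sketch with a hypothetical path (note that clean_title strips periods, so "Item 5.02" yields the key item502):

from pathlib import Path

from datamule.parser.document_parsing.basic_8k_parser import DuplicateSectionError, parse_8k  # assumed import path

try:
    parsed = parse_8k(Path("filings/example_8k.htm"))  # hypothetical path
    print(sorted(parsed["document"]))  # e.g. ['item502', 'item901', 'signatures']
except DuplicateSectionError as exc:
    print(f"Skipping filing with repeated sections: {exc}")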

File without changes: datamule/parser/document_parsing/company_concepts_parser.py

@@ -0,0 +1,70 @@ datamule/parser/document_parsing/form_d_parser.py
from xml.etree import ElementTree as ET

def element_to_dict(elem):
    """Convert an XML element to dict preserving structure."""
    result = {}

    # Add attributes directly to result
    if elem.attrib:
        result.update(elem.attrib)

    # Add text content if present and no children
    if elem.text and elem.text.strip():
        text = elem.text.strip()
        if not len(elem):  # No children
            return text
        else:
            result['text'] = text

    # Process children
    for child in elem:
        child_data = element_to_dict(child)
        child_tag = child.tag.split('}')[-1]  # Remove namespace

        if child_tag in result:
            # Convert to list if multiple elements
            if not isinstance(result[child_tag], list):
                result[child_tag] = [result[child_tag]]
            result[child_tag].append(child_data)
        else:
            result[child_tag] = child_data

    return result

def parse_form_d(filepath):
    """Parse Form D XML file into metadata and document sections."""
    # Parse XML
    tree = ET.parse(filepath)
    root = tree.getroot()

    # Remove namespaces for cleaner processing
    for elem in root.iter():
        if '}' in elem.tag:
            elem.tag = elem.tag.split('}')[-1]

    # Convert entire document to dict
    full_dict = element_to_dict(root)

    # Separate metadata and document content
    result = {
        'metadata': {},
        'document': {}
    }

    # Extract metadata
    metadata_fields = {
        'schemaVersion',
        'submissionType',
        'testOrLive',
        'primaryIssuer'  # Including all issuer information in metadata
    }

    for field in metadata_fields:
        if field in full_dict:
            result['metadata'][field] = full_dict[field]
            del full_dict[field]  # Remove from full_dict to avoid duplication

    # Everything else goes to document
    result['document'] = full_dict

    return result
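
element_to_dict can be exercised on an in-memory element without a file; a self-contained sketch on a synthetic fragment (not an actual Form D) showing how repeated child tags collapse into a list and leaf text becomes a plain string:

from xml.etree import ElementTree as ET

from datamule.parser.document_parsing.form_d_parser import element_to_dict  # assumed import path

xml = """
<offeringData>
  <industryGroup><industryGroupType>Other</industryGroupType></industryGroup>
  <relatedPersonsList>
    <relatedPersonInfo><relatedPersonName>Alice</relatedPersonName></relatedPersonInfo>
    <relatedPersonInfo><relatedPersonName>Bob</relatedPersonName></relatedPersonInfo>
  </relatedPersonsList>
</offeringData>
"""
print(element_to_dict(ET.fromstring(xml)))
# {'industryGroup': {'industryGroupType': 'Other'},
#  'relatedPersonsList': {'relatedPersonInfo': [{'relatedPersonName': 'Alice'},
#                                               {'relatedPersonName': 'Bob'}]}}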

@@ -0,0 +1,78 @@ datamule/parser/document_parsing/generalized_item_parser.py
# Parses e.g. 10-K, 10-Q,..... any form with items and/or parts
from .helper import load_file_content, clean_title, clean_text
from pathlib import Path
import re

# OK figured out general pattern
# find toc
# figure out mapping. we do need it
# just do mapping tonight

pattern = re.compile(r'^\s*(?:item\s+\d+(?:\.\d+)?(?:[a-z])?|signature(?:\.?s)?)\s*', re.I | re.M)

def find_anchors(content):
    anchors = []
    prev_title = None

    for part_match in pattern.finditer(content):
        title = clean_title(part_match.group())
        # Skip duplicates, e.g. "item 1" and "item1 continued"
        if prev_title == title:
            continue
        prev_title = title
        anchors.append((title, part_match.start()))

    return anchors

# I think this works, but I haven't tested it extensively.
def map_sections(content, anchors):
    positions = anchors + [('end', len(content))]

    result = {}
    for i, (title, start) in enumerate(positions[:-1]):
        _, next_start = positions[i + 1]
        section_text = content[start:next_start].strip()
        result[title.lower()] = clean_text(section_text)

    def sort_key(x):
        match = re.search(r'item\s+(\d+)(?:[\.a-z])?', x[0], re.I)
        if not match:
            return float('inf')
        num = match.group(0).lower()
        # This will sort 1, 1a, 1b, 2, 2a etc
        return float(re.findall(r'\d+', num)[0]) + (ord(num[-1]) - ord('a') + 1) / 100 if num[-1].isalpha() else float(re.findall(r'\d+', num)[0])

    return dict(sorted(result.items(), key=sort_key))

# def find_content_start(anchors):
#     def find_first_non_repeating(seq):
#         for i in range(len(seq)):
#             remaining = seq[i:]
#             # Get same length subsequence from the next position
#             next_seq = seq[i + 1:i + 1 + len(remaining)]
#             if remaining != next_seq and len(next_seq) > 0:
#                 return i
#         return 0  # Default to start if no pattern found

#     return find_first_non_repeating([title for title, _ in anchors])

def generalized_parser(filename):
    # load content
    content = load_file_content(filename)

    # find anchors
    anchors = find_anchors(content)

    # Skip tables of contents. Not implemented yet, since we overwrite the keys anyway.
    # content_start = find_content_start(anchors)
    # print(content_start)

    result = {}
    # assign metadata
    result["metadata"] = {"document_name": Path(filename).stem}

    # extract sections, assign text based on mapping_dict
    result['document'] = map_sections(content, anchors)

    return result
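
generalized_parser is the catch-all for item/part style forms. Like the other parsers it expects a pathlib.Path, since load_file_content dispatches on the suffix; a hypothetical call:

from pathlib import Path

from datamule.parser.document_parsing.generalized_item_parser import generalized_parser  # assumed import path

parsed = generalized_parser(Path("filings/example_filing.htm"))  # hypothetical path
# Keys are the cleaned headings, e.g. "item1", "item1a", "signatures".
print(list(parsed["document"]))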

File without changes: datamule/parser/document_parsing/generalized_xml_parser.py

@@ -0,0 +1,75 @@ datamule/parser/document_parsing/helper.py
from selectolax.parser import HTMLParser
import re

# This will be modified in the future to remove SEC specific code such as <PAGE> tags
def load_text_content(filename):
    with open(filename) as f:
        return f.read().translate(str.maketrans({
            '\xa0': ' ', '\u2003': ' ',
            '\u2018': "'", '\u2019': "'",
            '\u201c': '"', '\u201d': '"'
        }))

def load_html_content(filename):
    parser = HTMLParser(open(filename).read())

    # Remove hidden elements first
    hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
    for node in hidden_nodes:
        node.decompose()

    blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
    lines = []
    current_line = []

    def flush_line():
        if current_line:
            lines.append(' '.join(current_line))
            current_line.clear()

    for node in parser.root.traverse(include_text=True):
        if node.tag in ('script', 'style', 'css'):
            continue

        if node.tag in blocks:
            flush_line()
            lines.append('')

        if node.text_content:
            text = node.text_content.strip()
            if text:
                if node.tag in blocks:
                    flush_line()
                    lines.append(text)
                    lines.append('')
                else:
                    current_line.append(text)

    flush_line()

    text = '\n'.join(lines)
    while '\n\n\n' in text:
        text = text.replace('\n\n\n', '\n\n')

    return text.translate(str.maketrans({
        '\xa0': ' ', '\u2003': ' ',
        '\u2018': "'", '\u2019': "'",
        '\u201c': '"', '\u201d': '"'
    }))
def load_file_content(filename):
    if filename.suffix =='.txt':
        return load_text_content(filename)
    elif filename.suffix in ['.html','.htm']:
        return load_html_content(filename)
    else:
        raise ValueError(f"Unsupported file type: {filename}")

def clean_title(title: str) -> str:
    """Clean up section title by removing newlines, periods, and all whitespace, converting to lowercase."""
    return ''.join(title.replace('\n', '').replace('.', '').split()).lower()

# This is a bit hacky, removes PART IV, PART V etc from the end of the text
# we do this to avoid having to map for general cases
def clean_text(text):
    text = text.strip()
    return re.sub(r'\s*PART\s+[IVX]+\s*$', '', text, flags=re.I)
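
The helpers above are shared by all of the document parsers; clean_title in particular normalizes the headings that become dict keys. A small sketch (the filing path is hypothetical):

from pathlib import Path

from datamule.parser.document_parsing.helper import clean_title, load_file_content  # assumed import path

print(clean_title("Item 1A.\nRisk Factors"))  # item1ariskfactors
print(clean_title("ITEM 7."))                 # item7

# load_file_content dispatches on the suffix and raises ValueError for anything
# other than .txt, .html, or .htm, so callers must pass a pathlib.Path.
text = load_file_content(Path("filings/example_10k.htm"))  # hypothetical path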

@@ -0,0 +1,41 @@ datamule/parser/document_parsing/information_table_parser_13fhr.py
from xml.etree import ElementTree as ET

def parse_13f_hr_information_table_xml(xml_file):
    # Parse the XML file
    tree = ET.parse(xml_file)
    root = tree.getroot()

    data = []

    # Iterate through each infoTable
    for info_table in root.findall('.//{*}infoTable'):
        row = {
            'NAMEOFISSUER': info_table.findtext('.//{*}nameOfIssuer') or '',
            'TITLEOFCLASS': info_table.findtext('.//{*}titleOfClass') or '',
            'CUSIP': info_table.findtext('.//{*}cusip') or '',
            'FIGI': info_table.findtext('.//{*}figi') or '',
            'VALUE': info_table.findtext('.//{*}value') or '',
            'SSHPRNAMT': '',
            'SSHPRNAMTTYPE': '',
            'PUTCALL': info_table.findtext('.//{*}putCall') or '',
            'INVESTMENTDISCRETION': info_table.findtext('.//{*}investmentDiscretion') or '',
            'OTHERMANAGER': info_table.findtext('.//{*}otherManager') or '',
            'VOTING_AUTH_SOLE': '',
            'VOTING_AUTH_SHARED': '',
            'VOTING_AUTH_NONE': ''
        }

        shrs_or_prn_amt = info_table.find('.//{*}shrsOrPrnAmt')
        if shrs_or_prn_amt is not None:
            row['SSHPRNAMT'] = shrs_or_prn_amt.findtext('.//{*}sshPrnamt') or ''
            row['SSHPRNAMTTYPE'] = shrs_or_prn_amt.findtext('.//{*}sshPrnamtType') or ''

        voting_authority = info_table.find('.//{*}votingAuthority')
        if voting_authority is not None:
            row['VOTING_AUTH_SOLE'] = voting_authority.findtext('.//{*}Sole') or ''
            row['VOTING_AUTH_SHARED'] = voting_authority.findtext('.//{*}Shared') or ''
            row['VOTING_AUTH_NONE'] = voting_authority.findtext('.//{*}None') or ''

        data.append(row)

    return data
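
The information-table parser returns a flat list of dicts with fixed column names, which makes a CSV export straightforward; a sketch assuming a locally saved information table XML (hypothetical path):

import csv

from datamule.parser.document_parsing.information_table_parser_13fhr import (  # assumed import path
    parse_13f_hr_information_table_xml,
)

rows = parse_13f_hr_information_table_xml("filings/example_13fhr_infotable.xml")  # hypothetical path
if rows:
    with open("holdings.csv", "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=rows[0].keys())
        writer.writeheader()
        writer.writerows(rows)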