datamule-1.4.5-py3-none-any.whl → datamule-1.4.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/document/document.py +3 -14
- datamule/index.py +1 -1
- datamule/sec/submissions/textsearch.py +5 -3
- datamule/submission.py +7 -71
- {datamule-1.4.5.dist-info → datamule-1.4.9.dist-info}/METADATA +1 -1
- {datamule-1.4.5.dist-info → datamule-1.4.9.dist-info}/RECORD +8 -8
- {datamule-1.4.5.dist-info → datamule-1.4.9.dist-info}/WHEEL +0 -0
- {datamule-1.4.5.dist-info → datamule-1.4.9.dist-info}/top_level.txt +0 -0
datamule/document/document.py
CHANGED
@@ -11,19 +11,7 @@ from selectolax.parser import HTMLParser
|
|
11
11
|
from .processing import process_tabular_data
|
12
12
|
from pathlib import Path
|
13
13
|
import webbrowser
|
14
|
-
|
15
|
-
def convert_bytes_keys(obj):
|
16
|
-
if isinstance(obj, dict):
|
17
|
-
return {
|
18
|
-
(k.decode('utf-8').lower() if isinstance(k, bytes) else k): convert_bytes_keys(v)
|
19
|
-
for k, v in obj.items()
|
20
|
-
}
|
21
|
-
elif isinstance(obj, list):
|
22
|
-
return [convert_bytes_keys(item) for item in obj]
|
23
|
-
elif isinstance(obj, bytes):
|
24
|
-
return obj.decode('utf-8').lower()
|
25
|
-
else:
|
26
|
-
return obj
|
14
|
+
from secsgml.utils import bytes_to_str
|
27
15
|
|
28
16
|
class Document:
|
29
17
|
def __init__(self, type, content, extension,accession,filing_date,path=None):
|
@@ -34,7 +22,8 @@ class Document:
|
|
34
22
|
self.filing_date = filing_date
|
35
23
|
|
36
24
|
if self.type == 'submission_metadata':
|
37
|
-
|
25
|
+
# this converts to lower
|
26
|
+
self.content = bytes_to_str(content)
|
38
27
|
else:
|
39
28
|
self.content = content
|
40
29
|
|
datamule/index.py
CHANGED
datamule/sec/submissions/textsearch.py
CHANGED
@@ -7,14 +7,16 @@ class TextSearchEFTSQuery(EFTSQuery):
|
|
7
7
|
"""
|
8
8
|
def __init__(self, text_query, requests_per_second=5.0, quiet=False):
|
9
9
|
super().__init__(requests_per_second=requests_per_second, quiet=quiet)
|
10
|
-
|
10
|
+
if text_query is not None:
|
11
|
+
self.text_query = text_query
|
11
12
|
|
12
13
|
def _prepare_params(self, cik=None, submission_type=None, filing_date=None, location=None):
|
13
14
|
# Get base parameters from parent class
|
14
15
|
params = super()._prepare_params(cik, submission_type, filing_date, location)
|
15
16
|
|
16
17
|
# Add text query parameter
|
17
|
-
|
18
|
+
if self.text_query is not None:
|
19
|
+
params['q'] = self.text_query
|
18
20
|
|
19
21
|
return params
|
20
22
|
|
@@ -42,7 +44,7 @@ async def extract_accession_numbers(hits):
|
|
42
44
|
accession_numbers.append(acc_no)
|
43
45
|
return accession_numbers
|
44
46
|
|
45
|
-
def query(text_query, cik=None, submission_type=None, filing_date=None, location=None,
|
47
|
+
def query(text_query=None, cik=None, submission_type=None, filing_date=None, location=None,
|
46
48
|
name=None, requests_per_second=5.0, quiet=False):
|
47
49
|
"""
|
48
50
|
Search SEC filings for text and return the full search results.
|
datamule/submission.py
CHANGED
@@ -4,72 +4,8 @@ from .document.document import Document
|
|
4
4
|
from secsgml import parse_sgml_content_into_memory
|
5
5
|
import os
|
6
6
|
import aiofiles
|
7
|
-
import tempfile
|
8
|
-
|
9
|
-
|
10
|
-
# # NEW CODE YAY. probably will remove
|
11
|
-
|
12
|
-
# def save_metadata_atomically(metadata_file_path, metadata_content):
|
13
|
-
# """Save metadata to a JSONL file atomically, works on any filesystem"""
|
14
|
-
|
15
|
-
# # Create directory if it doesn't exist
|
16
|
-
# os.makedirs(os.path.dirname(metadata_file_path), exist_ok=True)
|
17
|
-
|
18
|
-
# # Format the JSON with newline
|
19
|
-
# json_str = json.dumps(metadata_content, indent=4) + "\n"
|
20
|
-
|
21
|
-
# # Write complete content to a temporary file first
|
22
|
-
# fd, temp_path = tempfile.mkstemp(dir=os.path.dirname(metadata_file_path))
|
23
|
-
# try:
|
24
|
-
# with os.fdopen(fd, 'w') as temp_file:
|
25
|
-
# temp_file.write(json_str)
|
26
|
-
# temp_file.flush()
|
27
|
-
# os.fsync(temp_file.fileno()) # Force write to disk
|
28
|
-
|
29
|
-
# # Append the temporary file to the main file
|
30
|
-
# with open(metadata_file_path, 'a') as target_file:
|
31
|
-
# with open(temp_path, 'r') as temp_read:
|
32
|
-
# content = temp_read.read()
|
33
|
-
# target_file.write(content)
|
34
|
-
# target_file.flush()
|
35
|
-
# os.fsync(target_file.fileno()) # Force write to disk
|
36
|
-
# finally:
|
37
|
-
# # Clean up the temporary file
|
38
|
-
# if os.path.exists(temp_path):
|
39
|
-
# os.unlink(temp_path)
|
40
|
-
|
41
|
-
# async def save_metadata_atomically_async(metadata_file_path, metadata_content):
|
42
|
-
# """Save metadata to a JSONL file atomically in async mode"""
|
43
|
-
|
44
|
-
# # Create directory if it doesn't exist
|
45
|
-
# os.makedirs(os.path.dirname(metadata_file_path), exist_ok=True)
|
46
|
-
|
47
|
-
# # Format the JSON with newline
|
48
|
-
# json_str = json.dumps(metadata_content, indent=4) + "\n"
|
49
|
-
|
50
|
-
# # Write to a temporary file first
|
51
|
-
# fd, temp_path = tempfile.mkstemp(dir=os.path.dirname(metadata_file_path))
|
52
|
-
# os.close(fd) # Close the file descriptor
|
53
|
-
|
54
|
-
# try:
|
55
|
-
# async with aiofiles.open(temp_path, 'w') as temp_file:
|
56
|
-
# await temp_file.write(json_str)
|
57
|
-
# await temp_file.flush()
|
58
|
-
|
59
|
-
# # Append the temporary file to the main file
|
60
|
-
# async with aiofiles.open(metadata_file_path, 'a') as target_file:
|
61
|
-
# async with aiofiles.open(temp_path, 'r') as temp_read:
|
62
|
-
# content = await temp_read.read()
|
63
|
-
# await target_file.write(content)
|
64
|
-
# await target_file.flush()
|
65
|
-
# finally:
|
66
|
-
# # Clean up the temporary file
|
67
|
-
# if os.path.exists(temp_path):
|
68
|
-
# os.unlink(temp_path)
|
69
|
-
|
70
|
-
# # END OF NEW CODE
|
71
|
-
|
72
7
|
|
8
|
+
# TODO add .tar path
|
73
9
|
class Submission:
|
74
10
|
def __init__(self, path=None,sgml_content=None,keep_document_types=None):
|
75
11
|
if path is None and sgml_content is None:
|
@@ -89,7 +25,7 @@ class Submission:
|
|
89
25
|
filtered_metadata_documents = []
|
90
26
|
|
91
27
|
for idx,doc in enumerate(self.metadata.content['documents']):
|
92
|
-
type = doc.get('type')
|
28
|
+
type = doc.get('type').upper()
|
93
29
|
|
94
30
|
# Keep only specified types
|
95
31
|
if keep_document_types is not None and type not in keep_document_types:
|
@@ -98,7 +34,7 @@ class Submission:
|
|
98
34
|
# write as txt if not declared
|
99
35
|
filename = doc.get('filename','.txt')
|
100
36
|
extension = Path(filename).suffix
|
101
|
-
self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
|
37
|
+
self.documents.append(Document(type=type.upper(), content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
|
102
38
|
|
103
39
|
filtered_metadata_documents.append(doc)
|
104
40
|
|
@@ -121,9 +57,9 @@ class Submission:
|
|
121
57
|
def document_type(self, document_type):
|
122
58
|
# Convert single document type to list for consistent handling
|
123
59
|
if isinstance(document_type, str):
|
124
|
-
document_types = [document_type]
|
60
|
+
document_types = [document_type.lower()]
|
125
61
|
else:
|
126
|
-
document_types = document_type
|
62
|
+
document_types = [item.lower() for item in document_type]
|
127
63
|
|
128
64
|
for idx,doc in enumerate(self.metadata.content['documents']):
|
129
65
|
if doc['type'] in document_types:
|
@@ -144,7 +80,7 @@ class Submission:
|
|
144
80
|
if extension in ['.htm','.html','.txt','.xml']:
|
145
81
|
content = content.decode('utf-8', errors='replace')
|
146
82
|
|
147
|
-
yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
|
83
|
+
yield Document(type=doc['type'].upper(), content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
|
148
84
|
# if loaded from sgml_content
|
149
85
|
else:
|
150
86
|
yield self.documents[idx]
|
@@ -171,7 +107,7 @@ class Submission:
|
|
171
107
|
if extension in ['.htm','.html','.txt','.xml']:
|
172
108
|
content = content.decode('utf-8', errors='replace')
|
173
109
|
|
174
|
-
yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
|
110
|
+
yield Document(type=doc['type'].upper(), content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
|
175
111
|
else:
|
176
112
|
print(f"Warning: File {document_path} does not exist likely due to keep types in downloading.")
|
177
113
|
|
{datamule-1.4.5.dist-info → datamule-1.4.9.dist-info}/RECORD
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
datamule/__init__.py,sha256=glzwBeGJEE6-TG7mRule9GH6L59XaIRR9T7ALcdpMus,1067
|
2
2
|
datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
|
3
3
|
datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
|
4
|
-
datamule/index.py,sha256=
|
4
|
+
datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
|
5
5
|
datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
|
6
6
|
datamule/portfolio.py,sha256=8fiK-vfZM5-NJSvOEsDR2YDb-2njjzFk6l7BiRyrzOM,7168
|
7
7
|
datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
|
8
|
-
datamule/submission.py,sha256=
|
8
|
+
datamule/submission.py,sha256=UTGIkXOFVrBTHLJxTekw60Nvp92GuSsgQApJWXHNuNg,8493
|
9
9
|
datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
|
10
10
|
datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
-
datamule/document/document.py,sha256=
|
11
|
+
datamule/document/document.py,sha256=VaJWo9HrcODlbifYcXzifW3xBD7nUOWAN8zcVCDWMcs,13958
|
12
12
|
datamule/document/processing.py,sha256=jDCEzBFDSQtq7nQxRScIsbALnFcvMPOkNkMUCa7mFxg,31921
|
13
13
|
datamule/document/table.py,sha256=73yUJKY82ap32jhLmZeTti-jQ_lyhcJGlGwyxLtgYOg,12944
|
14
14
|
datamule/document/mappings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -48,7 +48,7 @@ datamule/sec/submissions/downloader.py,sha256=izaz559PtBCAWPWGzqUReloawJtXwnracl
|
|
48
48
|
datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
|
49
49
|
datamule/sec/submissions/monitor.py,sha256=dZYuVCi_X82eYA8l_9cbnkRjiawz3K4U-FnCAyJcgk4,7892
|
50
50
|
datamule/sec/submissions/streamer.py,sha256=EXyWNCD9N6mZmvm9lFSCFodF19zSQ8jfIbWPZNp0K5Y,11253
|
51
|
-
datamule/sec/submissions/textsearch.py,sha256=
|
51
|
+
datamule/sec/submissions/textsearch.py,sha256=MKDXEz_VI_0ljl73_aw2lx4MVzJW5uDt8KxjvJBwPwM,5794
|
52
52
|
datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
53
53
|
datamule/sec/xbrl/downloadcompanyfacts.py,sha256=rMWRiCF9ci_gNZMJ9MC2c_PGEd-yEthawQ0CtVwWTjM,3323
|
54
54
|
datamule/sec/xbrl/filter_xbrl.py,sha256=g9OT4zrNS0tiUJeBIwbCs_zMisOBkpFnMR3tV4Tr39Q,1316
|
@@ -58,7 +58,7 @@ datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
|
|
58
58
|
datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
|
59
59
|
datamule/seclibrary/downloader.py,sha256=VIdaQq5wDcYWnqrv9t8J7z0KtdNRGK8ahfBsgvTfdQQ,13675
|
60
60
|
datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
|
61
|
-
datamule-1.4.
|
62
|
-
datamule-1.4.
|
63
|
-
datamule-1.4.
|
64
|
-
datamule-1.4.
|
61
|
+
datamule-1.4.9.dist-info/METADATA,sha256=ojII-iQoWPNIJXs28FNsQjExKxOdQROgYsIHdOfMw6I,469
|
62
|
+
datamule-1.4.9.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
63
|
+
datamule-1.4.9.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
|
64
|
+
datamule-1.4.9.dist-info/RECORD,,
|
File without changes
|
File without changes
|