datamule 1.4.6__py3-none-any.whl → 1.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,19 +11,7 @@ from selectolax.parser import HTMLParser
11
11
  from .processing import process_tabular_data
12
12
  from pathlib import Path
13
13
  import webbrowser
14
-
15
- def convert_bytes_keys(obj):
16
- if isinstance(obj, dict):
17
- return {
18
- (k.decode('utf-8').lower() if isinstance(k, bytes) else k): convert_bytes_keys(v)
19
- for k, v in obj.items()
20
- }
21
- elif isinstance(obj, list):
22
- return [convert_bytes_keys(item) for item in obj]
23
- elif isinstance(obj, bytes):
24
- return obj.decode('utf-8').lower()
25
- else:
26
- return obj
14
+ from secsgml.utils import bytes_to_str
27
15
 
28
16
  class Document:
29
17
  def __init__(self, type, content, extension,accession,filing_date,path=None):
@@ -34,7 +22,8 @@ class Document:
34
22
  self.filing_date = filing_date
35
23
 
36
24
  if self.type == 'submission_metadata':
37
- self.content = convert_bytes_keys(content)
25
+ # this converts to lower
26
+ self.content = bytes_to_str(content)
38
27
  else:
39
28
  self.content = content
40
29
 
datamule/submission.py CHANGED
@@ -4,72 +4,8 @@ from .document.document import Document
4
4
  from secsgml import parse_sgml_content_into_memory
5
5
  import os
6
6
  import aiofiles
7
- import tempfile
8
-
9
-
10
- # # NEW CODE YAY. probably will remove
11
-
12
- # def save_metadata_atomically(metadata_file_path, metadata_content):
13
- # """Save metadata to a JSONL file atomically, works on any filesystem"""
14
-
15
- # # Create directory if it doesn't exist
16
- # os.makedirs(os.path.dirname(metadata_file_path), exist_ok=True)
17
-
18
- # # Format the JSON with newline
19
- # json_str = json.dumps(metadata_content, indent=4) + "\n"
20
-
21
- # # Write complete content to a temporary file first
22
- # fd, temp_path = tempfile.mkstemp(dir=os.path.dirname(metadata_file_path))
23
- # try:
24
- # with os.fdopen(fd, 'w') as temp_file:
25
- # temp_file.write(json_str)
26
- # temp_file.flush()
27
- # os.fsync(temp_file.fileno()) # Force write to disk
28
-
29
- # # Append the temporary file to the main file
30
- # with open(metadata_file_path, 'a') as target_file:
31
- # with open(temp_path, 'r') as temp_read:
32
- # content = temp_read.read()
33
- # target_file.write(content)
34
- # target_file.flush()
35
- # os.fsync(target_file.fileno()) # Force write to disk
36
- # finally:
37
- # # Clean up the temporary file
38
- # if os.path.exists(temp_path):
39
- # os.unlink(temp_path)
40
-
41
- # async def save_metadata_atomically_async(metadata_file_path, metadata_content):
42
- # """Save metadata to a JSONL file atomically in async mode"""
43
-
44
- # # Create directory if it doesn't exist
45
- # os.makedirs(os.path.dirname(metadata_file_path), exist_ok=True)
46
-
47
- # # Format the JSON with newline
48
- # json_str = json.dumps(metadata_content, indent=4) + "\n"
49
-
50
- # # Write to a temporary file first
51
- # fd, temp_path = tempfile.mkstemp(dir=os.path.dirname(metadata_file_path))
52
- # os.close(fd) # Close the file descriptor
53
-
54
- # try:
55
- # async with aiofiles.open(temp_path, 'w') as temp_file:
56
- # await temp_file.write(json_str)
57
- # await temp_file.flush()
58
-
59
- # # Append the temporary file to the main file
60
- # async with aiofiles.open(metadata_file_path, 'a') as target_file:
61
- # async with aiofiles.open(temp_path, 'r') as temp_read:
62
- # content = await temp_read.read()
63
- # await target_file.write(content)
64
- # await target_file.flush()
65
- # finally:
66
- # # Clean up the temporary file
67
- # if os.path.exists(temp_path):
68
- # os.unlink(temp_path)
69
-
70
- # # END OF NEW CODE
71
-
72
7
 
8
+ # TODO add .tar path
73
9
  class Submission:
74
10
  def __init__(self, path=None,sgml_content=None,keep_document_types=None):
75
11
  if path is None and sgml_content is None:
@@ -89,7 +25,7 @@ class Submission:
89
25
  filtered_metadata_documents = []
90
26
 
91
27
  for idx,doc in enumerate(self.metadata.content['documents']):
92
- type = doc.get('type')
28
+ type = doc.get('type').upper()
93
29
 
94
30
  # Keep only specified types
95
31
  if keep_document_types is not None and type not in keep_document_types:
@@ -98,7 +34,7 @@ class Submission:
98
34
  # write as txt if not declared
99
35
  filename = doc.get('filename','.txt')
100
36
  extension = Path(filename).suffix
101
- self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
37
+ self.documents.append(Document(type=type.upper(), content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
102
38
 
103
39
  filtered_metadata_documents.append(doc)
104
40
 
@@ -121,9 +57,9 @@ class Submission:
121
57
  def document_type(self, document_type):
122
58
  # Convert single document type to list for consistent handling
123
59
  if isinstance(document_type, str):
124
- document_types = [document_type]
60
+ document_types = [document_type.lower()]
125
61
  else:
126
- document_types = document_type
62
+ document_types = [item.lower() for item in document_type]
127
63
 
128
64
  for idx,doc in enumerate(self.metadata.content['documents']):
129
65
  if doc['type'] in document_types:
@@ -144,7 +80,7 @@ class Submission:
144
80
  if extension in ['.htm','.html','.txt','.xml']:
145
81
  content = content.decode('utf-8', errors='replace')
146
82
 
147
- yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
83
+ yield Document(type=doc['type'].upper(), content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
148
84
  # if loaded from sgml_content
149
85
  else:
150
86
  yield self.documents[idx]
@@ -171,7 +107,7 @@ class Submission:
171
107
  if extension in ['.htm','.html','.txt','.xml']:
172
108
  content = content.decode('utf-8', errors='replace')
173
109
 
174
- yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
110
+ yield Document(type=doc['type'].upper(), content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
175
111
  else:
176
112
  print(f"Warning: File {document_path} does not exist likely due to keep types in downloading.")
177
113
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 1.4.6
3
+ Version: 1.4.9
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -5,10 +5,10 @@ datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
5
5
  datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
6
6
  datamule/portfolio.py,sha256=8fiK-vfZM5-NJSvOEsDR2YDb-2njjzFk6l7BiRyrzOM,7168
7
7
  datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
8
- datamule/submission.py,sha256=EtWdEnAyWLZdu69Dyzbs4qb5YL41HlExFGMjwEoMhsg,10904
8
+ datamule/submission.py,sha256=UTGIkXOFVrBTHLJxTekw60Nvp92GuSsgQApJWXHNuNg,8493
9
9
  datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
10
10
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- datamule/document/document.py,sha256=3vX850H7rZH4H8BysitZDaLhT6WPJuIreoV1PSjACno,14301
11
+ datamule/document/document.py,sha256=VaJWo9HrcODlbifYcXzifW3xBD7nUOWAN8zcVCDWMcs,13958
12
12
  datamule/document/processing.py,sha256=jDCEzBFDSQtq7nQxRScIsbALnFcvMPOkNkMUCa7mFxg,31921
13
13
  datamule/document/table.py,sha256=73yUJKY82ap32jhLmZeTti-jQ_lyhcJGlGwyxLtgYOg,12944
14
14
  datamule/document/mappings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -58,7 +58,7 @@ datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
58
58
  datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
59
59
  datamule/seclibrary/downloader.py,sha256=VIdaQq5wDcYWnqrv9t8J7z0KtdNRGK8ahfBsgvTfdQQ,13675
60
60
  datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
61
- datamule-1.4.6.dist-info/METADATA,sha256=IxggkAHbjanZjnTtWGNOyRM68sztal4gQlUfa0shlXg,469
62
- datamule-1.4.6.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
63
- datamule-1.4.6.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
64
- datamule-1.4.6.dist-info/RECORD,,
61
+ datamule-1.4.9.dist-info/METADATA,sha256=ojII-iQoWPNIJXs28FNsQjExKxOdQROgYsIHdOfMw6I,469
62
+ datamule-1.4.9.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
63
+ datamule-1.4.9.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
64
+ datamule-1.4.9.dist-info/RECORD,,