datamule 1.4.5__py3-none-any.whl → 1.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,19 +11,7 @@ from selectolax.parser import HTMLParser
11
11
  from .processing import process_tabular_data
12
12
  from pathlib import Path
13
13
  import webbrowser
14
-
15
- def convert_bytes_keys(obj):
16
- if isinstance(obj, dict):
17
- return {
18
- (k.decode('utf-8').lower() if isinstance(k, bytes) else k): convert_bytes_keys(v)
19
- for k, v in obj.items()
20
- }
21
- elif isinstance(obj, list):
22
- return [convert_bytes_keys(item) for item in obj]
23
- elif isinstance(obj, bytes):
24
- return obj.decode('utf-8').lower()
25
- else:
26
- return obj
14
+ from secsgml.utils import bytes_to_str
27
15
 
28
16
  class Document:
29
17
  def __init__(self, type, content, extension,accession,filing_date,path=None):
@@ -34,7 +22,8 @@ class Document:
34
22
  self.filing_date = filing_date
35
23
 
36
24
  if self.type == 'submission_metadata':
37
- self.content = convert_bytes_keys(content)
25
+ # this converts to lower
26
+ self.content = bytes_to_str(content)
38
27
  else:
39
28
  self.content = content
40
29
 
datamule/index.py CHANGED
@@ -9,7 +9,7 @@ class Index:
9
9
 
10
10
  def search_submissions(
11
11
  self,
12
- text_query,
12
+ text_query=None,
13
13
  filing_date=None,
14
14
  submission_type=None,
15
15
  cik=None,
@@ -7,14 +7,16 @@ class TextSearchEFTSQuery(EFTSQuery):
7
7
  """
8
8
  def __init__(self, text_query, requests_per_second=5.0, quiet=False):
9
9
  super().__init__(requests_per_second=requests_per_second, quiet=quiet)
10
- self.text_query = text_query
10
+ if text_query is not None:
11
+ self.text_query = text_query
11
12
 
12
13
  def _prepare_params(self, cik=None, submission_type=None, filing_date=None, location=None):
13
14
  # Get base parameters from parent class
14
15
  params = super()._prepare_params(cik, submission_type, filing_date, location)
15
16
 
16
17
  # Add text query parameter
17
- params['q'] = self.text_query
18
+ if self.text_query is not None:
19
+ params['q'] = self.text_query
18
20
 
19
21
  return params
20
22
 
@@ -42,7 +44,7 @@ async def extract_accession_numbers(hits):
42
44
  accession_numbers.append(acc_no)
43
45
  return accession_numbers
44
46
 
45
- def query(text_query, cik=None, submission_type=None, filing_date=None, location=None,
47
+ def query(text_query=None, cik=None, submission_type=None, filing_date=None, location=None,
46
48
  name=None, requests_per_second=5.0, quiet=False):
47
49
  """
48
50
  Search SEC filings for text and return the full search results.
datamule/submission.py CHANGED
@@ -4,72 +4,8 @@ from .document.document import Document
4
4
  from secsgml import parse_sgml_content_into_memory
5
5
  import os
6
6
  import aiofiles
7
- import tempfile
8
-
9
-
10
- # # NEW CODE YAY. probably will remove
11
-
12
- # def save_metadata_atomically(metadata_file_path, metadata_content):
13
- # """Save metadata to a JSONL file atomically, works on any filesystem"""
14
-
15
- # # Create directory if it doesn't exist
16
- # os.makedirs(os.path.dirname(metadata_file_path), exist_ok=True)
17
-
18
- # # Format the JSON with newline
19
- # json_str = json.dumps(metadata_content, indent=4) + "\n"
20
-
21
- # # Write complete content to a temporary file first
22
- # fd, temp_path = tempfile.mkstemp(dir=os.path.dirname(metadata_file_path))
23
- # try:
24
- # with os.fdopen(fd, 'w') as temp_file:
25
- # temp_file.write(json_str)
26
- # temp_file.flush()
27
- # os.fsync(temp_file.fileno()) # Force write to disk
28
-
29
- # # Append the temporary file to the main file
30
- # with open(metadata_file_path, 'a') as target_file:
31
- # with open(temp_path, 'r') as temp_read:
32
- # content = temp_read.read()
33
- # target_file.write(content)
34
- # target_file.flush()
35
- # os.fsync(target_file.fileno()) # Force write to disk
36
- # finally:
37
- # # Clean up the temporary file
38
- # if os.path.exists(temp_path):
39
- # os.unlink(temp_path)
40
-
41
- # async def save_metadata_atomically_async(metadata_file_path, metadata_content):
42
- # """Save metadata to a JSONL file atomically in async mode"""
43
-
44
- # # Create directory if it doesn't exist
45
- # os.makedirs(os.path.dirname(metadata_file_path), exist_ok=True)
46
-
47
- # # Format the JSON with newline
48
- # json_str = json.dumps(metadata_content, indent=4) + "\n"
49
-
50
- # # Write to a temporary file first
51
- # fd, temp_path = tempfile.mkstemp(dir=os.path.dirname(metadata_file_path))
52
- # os.close(fd) # Close the file descriptor
53
-
54
- # try:
55
- # async with aiofiles.open(temp_path, 'w') as temp_file:
56
- # await temp_file.write(json_str)
57
- # await temp_file.flush()
58
-
59
- # # Append the temporary file to the main file
60
- # async with aiofiles.open(metadata_file_path, 'a') as target_file:
61
- # async with aiofiles.open(temp_path, 'r') as temp_read:
62
- # content = await temp_read.read()
63
- # await target_file.write(content)
64
- # await target_file.flush()
65
- # finally:
66
- # # Clean up the temporary file
67
- # if os.path.exists(temp_path):
68
- # os.unlink(temp_path)
69
-
70
- # # END OF NEW CODE
71
-
72
7
 
8
+ # TODO add .tar path
73
9
  class Submission:
74
10
  def __init__(self, path=None,sgml_content=None,keep_document_types=None):
75
11
  if path is None and sgml_content is None:
@@ -89,7 +25,7 @@ class Submission:
89
25
  filtered_metadata_documents = []
90
26
 
91
27
  for idx,doc in enumerate(self.metadata.content['documents']):
92
- type = doc.get('type')
28
+ type = doc.get('type').upper()
93
29
 
94
30
  # Keep only specified types
95
31
  if keep_document_types is not None and type not in keep_document_types:
@@ -98,7 +34,7 @@ class Submission:
98
34
  # write as txt if not declared
99
35
  filename = doc.get('filename','.txt')
100
36
  extension = Path(filename).suffix
101
- self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
37
+ self.documents.append(Document(type=type.upper(), content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
102
38
 
103
39
  filtered_metadata_documents.append(doc)
104
40
 
@@ -121,9 +57,9 @@ class Submission:
121
57
  def document_type(self, document_type):
122
58
  # Convert single document type to list for consistent handling
123
59
  if isinstance(document_type, str):
124
- document_types = [document_type]
60
+ document_types = [document_type.lower()]
125
61
  else:
126
- document_types = document_type
62
+ document_types = [item.lower() for item in document_type]
127
63
 
128
64
  for idx,doc in enumerate(self.metadata.content['documents']):
129
65
  if doc['type'] in document_types:
@@ -144,7 +80,7 @@ class Submission:
144
80
  if extension in ['.htm','.html','.txt','.xml']:
145
81
  content = content.decode('utf-8', errors='replace')
146
82
 
147
- yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
83
+ yield Document(type=doc['type'].upper(), content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
148
84
  # if loaded from sgml_content
149
85
  else:
150
86
  yield self.documents[idx]
@@ -171,7 +107,7 @@ class Submission:
171
107
  if extension in ['.htm','.html','.txt','.xml']:
172
108
  content = content.decode('utf-8', errors='replace')
173
109
 
174
- yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
110
+ yield Document(type=doc['type'].upper(), content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
175
111
  else:
176
112
  print(f"Warning: File {document_path} does not exist likely due to keep types in downloading.")
177
113
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 1.4.5
3
+ Version: 1.4.9
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -1,14 +1,14 @@
1
1
  datamule/__init__.py,sha256=glzwBeGJEE6-TG7mRule9GH6L59XaIRR9T7ALcdpMus,1067
2
2
  datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
3
3
  datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
4
- datamule/index.py,sha256=_7Ox5hyF_7RWdblVFr5rNyv_ARwBP7VY4f703pk9qQ8,2074
4
+ datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
5
5
  datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
6
6
  datamule/portfolio.py,sha256=8fiK-vfZM5-NJSvOEsDR2YDb-2njjzFk6l7BiRyrzOM,7168
7
7
  datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
8
- datamule/submission.py,sha256=EtWdEnAyWLZdu69Dyzbs4qb5YL41HlExFGMjwEoMhsg,10904
8
+ datamule/submission.py,sha256=UTGIkXOFVrBTHLJxTekw60Nvp92GuSsgQApJWXHNuNg,8493
9
9
  datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
10
10
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- datamule/document/document.py,sha256=3vX850H7rZH4H8BysitZDaLhT6WPJuIreoV1PSjACno,14301
11
+ datamule/document/document.py,sha256=VaJWo9HrcODlbifYcXzifW3xBD7nUOWAN8zcVCDWMcs,13958
12
12
  datamule/document/processing.py,sha256=jDCEzBFDSQtq7nQxRScIsbALnFcvMPOkNkMUCa7mFxg,31921
13
13
  datamule/document/table.py,sha256=73yUJKY82ap32jhLmZeTti-jQ_lyhcJGlGwyxLtgYOg,12944
14
14
  datamule/document/mappings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -48,7 +48,7 @@ datamule/sec/submissions/downloader.py,sha256=izaz559PtBCAWPWGzqUReloawJtXwnracl
48
48
  datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
49
49
  datamule/sec/submissions/monitor.py,sha256=dZYuVCi_X82eYA8l_9cbnkRjiawz3K4U-FnCAyJcgk4,7892
50
50
  datamule/sec/submissions/streamer.py,sha256=EXyWNCD9N6mZmvm9lFSCFodF19zSQ8jfIbWPZNp0K5Y,11253
51
- datamule/sec/submissions/textsearch.py,sha256=zEr3NXdhVFL8eMh2jruVXIt7taUZTMdNy2hOAyRM2pA,5706
51
+ datamule/sec/submissions/textsearch.py,sha256=MKDXEz_VI_0ljl73_aw2lx4MVzJW5uDt8KxjvJBwPwM,5794
52
52
  datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
53
  datamule/sec/xbrl/downloadcompanyfacts.py,sha256=rMWRiCF9ci_gNZMJ9MC2c_PGEd-yEthawQ0CtVwWTjM,3323
54
54
  datamule/sec/xbrl/filter_xbrl.py,sha256=g9OT4zrNS0tiUJeBIwbCs_zMisOBkpFnMR3tV4Tr39Q,1316
@@ -58,7 +58,7 @@ datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
58
58
  datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
59
59
  datamule/seclibrary/downloader.py,sha256=VIdaQq5wDcYWnqrv9t8J7z0KtdNRGK8ahfBsgvTfdQQ,13675
60
60
  datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
61
- datamule-1.4.5.dist-info/METADATA,sha256=oKn8LWnuOVozqGyVe7ggRcCmoyjy7vwZbgZG7AKYWGY,469
62
- datamule-1.4.5.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
63
- datamule-1.4.5.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
64
- datamule-1.4.5.dist-info/RECORD,,
61
+ datamule-1.4.9.dist-info/METADATA,sha256=ojII-iQoWPNIJXs28FNsQjExKxOdQROgYsIHdOfMw6I,469
62
+ datamule-1.4.9.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
63
+ datamule-1.4.9.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
64
+ datamule-1.4.9.dist-info/RECORD,,