datamule 1.6.3__tar.gz → 1.6.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. {datamule-1.6.3 → datamule-1.6.5}/PKG-INFO +1 -1
  2. {datamule-1.6.3 → datamule-1.6.5}/datamule/sec/submissions/monitor.py +9 -1
  3. {datamule-1.6.3 → datamule-1.6.5}/datamule/submission.py +50 -18
  4. {datamule-1.6.3 → datamule-1.6.5}/datamule.egg-info/PKG-INFO +1 -1
  5. {datamule-1.6.3 → datamule-1.6.5}/setup.py +1 -1
  6. {datamule-1.6.3 → datamule-1.6.5}/datamule/__init__.py +0 -0
  7. {datamule-1.6.3 → datamule-1.6.5}/datamule/config.py +0 -0
  8. {datamule-1.6.3 → datamule-1.6.5}/datamule/data/listed_filer_metadata.csv +0 -0
  9. {datamule-1.6.3 → datamule-1.6.5}/datamule/datamule/__init__.py +0 -0
  10. {datamule-1.6.3 → datamule-1.6.5}/datamule/datamule/sec_connector.py +0 -0
  11. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/__init__.py +0 -0
  12. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/document.py +0 -0
  13. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/__init__.py +0 -0
  14. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/atsn.py +0 -0
  15. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/cfportal.py +0 -0
  16. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/d.py +0 -0
  17. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/ex102_abs.py +0 -0
  18. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/ex99a_sdr.py +0 -0
  19. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/ex99c_sdr.py +0 -0
  20. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/ex99g_sdr.py +0 -0
  21. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/ex99i_sdr.py +0 -0
  22. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/information_table.py +0 -0
  23. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/nmfp.py +0 -0
  24. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/npx.py +0 -0
  25. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/onefourtyfour.py +0 -0
  26. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/ownership.py +0 -0
  27. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/proxy_voting_record.py +0 -0
  28. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/sbs.py +0 -0
  29. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/sbsef.py +0 -0
  30. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/schedule13.py +0 -0
  31. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/sdr.py +0 -0
  32. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/submission_metadata.py +0 -0
  33. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/ta.py +0 -0
  34. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/thirteenfhr.py +0 -0
  35. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/twentyfivense.py +0 -0
  36. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings/twentyfourf2nt.py +0 -0
  37. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings_new/__init__.py +0 -0
  38. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings_new/mappings.py +0 -0
  39. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/mappings_new/ownership.py +0 -0
  40. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/processing.py +0 -0
  41. {datamule-1.6.3 → datamule-1.6.5}/datamule/document/table.py +0 -0
  42. {datamule-1.6.3 → datamule-1.6.5}/datamule/helper.py +0 -0
  43. {datamule-1.6.3 → datamule-1.6.5}/datamule/index.py +0 -0
  44. {datamule-1.6.3 → datamule-1.6.5}/datamule/mapping_dicts/__init__.py +0 -0
  45. {datamule-1.6.3 → datamule-1.6.5}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  46. {datamule-1.6.3 → datamule-1.6.5}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  47. {datamule-1.6.3 → datamule-1.6.5}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  48. {datamule-1.6.3 → datamule-1.6.5}/datamule/package_updater.py +0 -0
  49. {datamule-1.6.3 → datamule-1.6.5}/datamule/portfolio.py +0 -0
  50. {datamule-1.6.3 → datamule-1.6.5}/datamule/portfolio_compression_utils.py +0 -0
  51. {datamule-1.6.3 → datamule-1.6.5}/datamule/sec/__init__.py +0 -0
  52. {datamule-1.6.3 → datamule-1.6.5}/datamule/sec/infrastructure/__init__.py +0 -0
  53. {datamule-1.6.3 → datamule-1.6.5}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  54. {datamule-1.6.3 → datamule-1.6.5}/datamule/sec/submissions/__init__.py +0 -0
  55. {datamule-1.6.3 → datamule-1.6.5}/datamule/sec/submissions/downloader.py +0 -0
  56. {datamule-1.6.3 → datamule-1.6.5}/datamule/sec/submissions/eftsquery.py +0 -0
  57. {datamule-1.6.3 → datamule-1.6.5}/datamule/sec/submissions/streamer.py +0 -0
  58. {datamule-1.6.3 → datamule-1.6.5}/datamule/sec/submissions/textsearch.py +0 -0
  59. {datamule-1.6.3 → datamule-1.6.5}/datamule/sec/utils.py +0 -0
  60. {datamule-1.6.3 → datamule-1.6.5}/datamule/sec/xbrl/__init__.py +0 -0
  61. {datamule-1.6.3 → datamule-1.6.5}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  62. {datamule-1.6.3 → datamule-1.6.5}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  63. {datamule-1.6.3 → datamule-1.6.5}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  64. {datamule-1.6.3 → datamule-1.6.5}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  65. {datamule-1.6.3 → datamule-1.6.5}/datamule/seclibrary/__init__.py +0 -0
  66. {datamule-1.6.3 → datamule-1.6.5}/datamule/seclibrary/bq.py +0 -0
  67. {datamule-1.6.3 → datamule-1.6.5}/datamule/seclibrary/downloader.py +0 -0
  68. {datamule-1.6.3 → datamule-1.6.5}/datamule/seclibrary/query.py +0 -0
  69. {datamule-1.6.3 → datamule-1.6.5}/datamule/sheet.py +0 -0
  70. {datamule-1.6.3 → datamule-1.6.5}/datamule/utils/__init__.py +0 -0
  71. {datamule-1.6.3 → datamule-1.6.5}/datamule/utils/construct_submissions_data.py +0 -0
  72. {datamule-1.6.3 → datamule-1.6.5}/datamule.egg-info/SOURCES.txt +0 -0
  73. {datamule-1.6.3 → datamule-1.6.5}/datamule.egg-info/dependency_links.txt +0 -0
  74. {datamule-1.6.3 → datamule-1.6.5}/datamule.egg-info/requires.txt +0 -0
  75. {datamule-1.6.3 → datamule-1.6.5}/datamule.egg-info/top_level.txt +0 -0
  76. {datamule-1.6.3 → datamule-1.6.5}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 1.6.3
3
+ Version: 1.6.5
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -65,6 +65,7 @@ def clean_efts_hits(hits):
65
65
  class Monitor():
66
66
  def __init__(self):
67
67
  self.accessions = deque(maxlen=50000)
68
+ self.accessions_set = set()
68
69
  self.ratelimiters = {'sec.gov': PreciseRateLimiter(rate=5)}
69
70
  self.efts_query = EFTSQuery(quiet=True)
70
71
  self.efts_query.limiter = self.ratelimiters['sec.gov']
@@ -250,7 +251,14 @@ class Monitor():
250
251
  new_items = []
251
252
  for item in items:
252
253
  accession = item['accession']
253
- if accession not in self.accessions:
254
+ if accession not in self.accessions_set: # O(1) lookup instead of O(n)
255
+ # Check if deque is at capacity and will evict an item
256
+ if len(self.accessions) == self.accessions.maxlen:
257
+ evicted_accession = self.accessions[0] # Get leftmost item that will be evicted
258
+ self.accessions_set.discard(evicted_accession) # Remove from set
259
+
260
+ # Add to both structures
254
261
  self.accessions.append(accession)
262
+ self.accessions_set.add(accession)
255
263
  new_items.append(item)
256
264
  return new_items
@@ -144,24 +144,57 @@ class Submission:
144
144
 
145
145
  # Handle regular path case
146
146
  else:
147
- # Use exact filename from metadata
148
- document_path = self.path / filename
149
-
150
- if not document_path.exists():
151
- raise FileNotFoundError(f"Document file not found: {document_path}")
152
-
153
- with document_path.open('rb') as f:
154
- content = f.read()
155
-
156
- # Decompress if needed based on filename extension
157
- if is_compressed == 'gzip':
158
- content = gzip.decompress(content)
159
- elif is_compressed == 'zstd':
160
- content = zstd.ZstdDecompressor().decompress(content)
147
+ # Check if path is a tar file (old format)
148
+ if self.path.suffix == '.tar':
149
+ with tarfile.open(self.path, 'r') as tar:
150
+ # Try to extract the file, handling compression
151
+ try:
152
+ content = tar.extractfile(filename).read()
153
+ actual_filename = filename
154
+ except:
155
+ try:
156
+ content = tar.extractfile(filename + '.gz').read()
157
+ actual_filename = filename + '.gz'
158
+ is_compressed = 'gzip'
159
+ except:
160
+ try:
161
+ content = tar.extractfile(filename + '.zst').read()
162
+ actual_filename = filename + '.zst'
163
+ is_compressed = 'zstd'
164
+ except:
165
+ raise FileNotFoundError(f"Document file not found in tar: {filename}")
166
+
167
+ # Decompress if compressed
168
+ if is_compressed == 'gzip':
169
+ content = gzip.decompress(content)
170
+ elif is_compressed == 'zstd':
171
+ content = zstd.ZstdDecompressor().decompress(content)
172
+
173
+ # Decode text files
174
+ if extension in ['.htm', '.html', '.txt', '.xml']:
175
+ content = content.decode('utf-8', errors='replace')
176
+
177
+ document_path = f"{self.path}::{actual_filename}"
161
178
 
162
- # Decode text files
163
- if extension in ['.htm', '.html', '.txt', '.xml']:
164
- content = content.decode('utf-8', errors='replace')
179
+ else:
180
+ # Regular directory case
181
+ document_path = self.path / filename
182
+
183
+ if not document_path.exists():
184
+ raise FileNotFoundError(f"Document file not found: {document_path}")
185
+
186
+ with document_path.open('rb') as f:
187
+ content = f.read()
188
+
189
+ # Decompress if needed based on filename extension
190
+ if is_compressed == 'gzip':
191
+ content = gzip.decompress(content)
192
+ elif is_compressed == 'zstd':
193
+ content = zstd.ZstdDecompressor().decompress(content)
194
+
195
+ # Decode text files
196
+ if extension in ['.htm', '.html', '.txt', '.xml']:
197
+ content = content.decode('utf-8', errors='replace')
165
198
 
166
199
  return Document(
167
200
  type=doc['type'],
@@ -171,7 +204,6 @@ class Submission:
171
204
  accession=self.accession,
172
205
  path=document_path
173
206
  )
174
-
175
207
  def __iter__(self):
176
208
  """Make Submission iterable by yielding all documents."""
177
209
  for idx in range(len(self.metadata.content['documents'])):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 1.6.3
3
+ Version: 1.6.5
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
32
32
  setup(
33
33
  name="datamule",
34
34
  author="John Friedman",
35
- version="1.6.3",
35
+ version="1.6.5",
36
36
  description="Work with SEC submissions at scale.",
37
37
  packages=find_packages(include=['datamule', 'datamule.*']),
38
38
  url="https://github.com/john-friedman/datamule-python",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes