datamule 0.427.tar.gz → 1.0.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {datamule-0.427 → datamule-1.0.3}/PKG-INFO +1 -2
  2. datamule-1.0.3/datamule/__init__.py +47 -0
  3. datamule-1.0.3/datamule/book/__init__.py +0 -0
  4. datamule-1.0.3/datamule/book/book.py +34 -0
  5. datamule-1.0.3/datamule/book/eftsquery.py +127 -0
  6. datamule-1.0.3/datamule/book/xbrl_retriever.py +88 -0
  7. datamule-1.0.3/datamule/config.py +29 -0
  8. datamule-1.0.3/datamule/document.py +279 -0
  9. {datamule-0.427 → datamule-1.0.3}/datamule/downloader/downloader.py +18 -8
  10. {datamule-0.427 → datamule-1.0.3}/datamule/downloader/premiumdownloader.py +8 -5
  11. datamule-1.0.3/datamule/mapping_dicts/txt_mapping_dicts.py +234 -0
  12. datamule-1.0.3/datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
  13. {datamule-0.427 → datamule-1.0.3}/datamule/monitor.py +57 -10
  14. datamule-1.0.3/datamule/portfolio.py +106 -0
  15. {datamule-0.427 → datamule-1.0.3}/datamule/submission.py +16 -7
  16. {datamule-0.427 → datamule-1.0.3}/datamule.egg-info/PKG-INFO +1 -2
  17. {datamule-0.427 → datamule-1.0.3}/datamule.egg-info/SOURCES.txt +8 -17
  18. {datamule-0.427 → datamule-1.0.3}/datamule.egg-info/requires.txt +3 -9
  19. datamule-1.0.3/setup.py +47 -0
  20. datamule-0.427/datamule/__init__.py +0 -71
  21. datamule-0.427/datamule/dataset_builder/dataset_builder.py +0 -259
  22. datamule-0.427/datamule/document.py +0 -130
  23. datamule-0.427/datamule/parser/document_parsing/basic_10k_parser.py +0 -82
  24. datamule-0.427/datamule/parser/document_parsing/basic_10q_parser.py +0 -73
  25. datamule-0.427/datamule/parser/document_parsing/basic_13d_parser.py +0 -58
  26. datamule-0.427/datamule/parser/document_parsing/basic_13g_parser.py +0 -61
  27. datamule-0.427/datamule/parser/document_parsing/basic_8k_parser.py +0 -84
  28. datamule-0.427/datamule/parser/document_parsing/form_d_parser.py +0 -70
  29. datamule-0.427/datamule/parser/document_parsing/generalized_item_parser.py +0 -78
  30. datamule-0.427/datamule/parser/document_parsing/helper.py +0 -75
  31. datamule-0.427/datamule/parser/document_parsing/information_table_parser_13fhr.py +0 -41
  32. datamule-0.427/datamule/parser/document_parsing/insider_trading_parser.py +0 -158
  33. datamule-0.427/datamule/parser/document_parsing/mappings.py +0 -95
  34. datamule-0.427/datamule/parser/document_parsing/n_port_p_parser.py +0 -70
  35. datamule-0.427/datamule/parser/document_parsing/sec_parser.py +0 -73
  36. datamule-0.427/datamule/parser/document_parsing/sgml_parser.py +0 -94
  37. datamule-0.427/datamule/parser/sgml_parsing/sgml_parser_cy.c +0 -20006
  38. datamule-0.427/datamule/portfolio.py +0 -31
  39. datamule-0.427/setup.py +0 -93
  40. {datamule-0.427 → datamule-1.0.3}/datamule/data/company_former_names.csv +0 -0
  41. {datamule-0.427 → datamule-1.0.3}/datamule/data/company_metadata.csv +0 -0
  42. {datamule-0.427 → datamule-1.0.3}/datamule/data/company_tickers.csv +0 -0
  43. {datamule-0.427 → datamule-1.0.3}/datamule/data/sec-glossary.csv +0 -0
  44. {datamule-0.427 → datamule-1.0.3}/datamule/data/xbrl_descriptions.csv +0 -0
  45. {datamule-0.427 → datamule-1.0.3}/datamule/helper.py +0 -0
  46. {datamule-0.427 → datamule-1.0.3}/datamule/mulebot/__init__.py +0 -0
  47. {datamule-0.427 → datamule-1.0.3}/datamule/mulebot/helper.py +0 -0
  48. {datamule-0.427 → datamule-1.0.3}/datamule/mulebot/mulebot.py +0 -0
  49. {datamule-0.427 → datamule-1.0.3}/datamule/mulebot/mulebot_server/__init__.py +0 -0
  50. {datamule-0.427 → datamule-1.0.3}/datamule/mulebot/mulebot_server/server.py +0 -0
  51. {datamule-0.427 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/css/minimalist.css +0 -0
  52. {datamule-0.427 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/scripts/artifacts.js +0 -0
  53. {datamule-0.427 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/scripts/chat.js +0 -0
  54. {datamule-0.427 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +0 -0
  55. {datamule-0.427 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +0 -0
  56. {datamule-0.427 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/scripts/main.js +0 -0
  57. {datamule-0.427 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +0 -0
  58. {datamule-0.427 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/scripts/suggestions.js +0 -0
  59. {datamule-0.427 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +0 -0
  60. {datamule-0.427 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/scripts/utils.js +0 -0
  61. {datamule-0.427 → datamule-1.0.3}/datamule/mulebot/mulebot_server/templates/chat-minimalist.html +0 -0
  62. {datamule-0.427 → datamule-1.0.3}/datamule/mulebot/search.py +0 -0
  63. {datamule-0.427 → datamule-1.0.3}/datamule/mulebot/tools.py +0 -0
  64. {datamule-0.427 → datamule-1.0.3}/datamule/packageupdater.py +0 -0
  65. {datamule-0.427 → datamule-1.0.3}/datamule.egg-info/dependency_links.txt +0 -0
  66. {datamule-0.427 → datamule-1.0.3}/datamule.egg-info/top_level.txt +0 -0
  67. {datamule-0.427 → datamule-1.0.3}/setup.cfg +0 -0
{datamule-0.427 → datamule-1.0.3}/PKG-INFO
@@ -1,10 +1,9 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 0.427
+ Version: 1.0.3
  Summary: Making it easier to use SEC filings.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman
  Provides-Extra: mulebot
  Provides-Extra: mulebot_server
- Provides-Extra: dataset_builder
  Provides-Extra: all
datamule-1.0.3/datamule/__init__.py
@@ -0,0 +1,47 @@
+ from .downloader.downloader import Downloader
+ from .downloader.premiumdownloader import PremiumDownloader
+ from .monitor import Monitor
+ from .packageupdater import PackageUpdater
+ from .submission import Submission
+ from .portfolio import Portfolio
+ from .document import Document
+ from secsgml import parse_sgml_submission
+ from .helper import load_package_csv, load_package_dataset
+ from .config import Config
+
+
+ # Keep the notebook environment setup
+ def _is_notebook_env():
+     """Check if the code is running in a Jupyter or Colab environment."""
+     try:
+         shell = get_ipython().__class__.__name__
+         return shell in ('ZMQInteractiveShell', 'Shell', 'Google.Colab')
+     except NameError:
+         return False
+
+ from functools import lru_cache
+
+ @lru_cache(maxsize=1)
+ def _setup_notebook_env():
+     """Setup Jupyter/Colab-specific configurations if needed."""
+     if _is_notebook_env():
+         import nest_asyncio
+         nest_asyncio.apply()
+
+ # Set up notebook environment
+ _setup_notebook_env()
+
+ __all__ = [
+     'Downloader',
+     'PremiumDownloader',
+     'load_package_csv',
+     'load_package_dataset',
+     'Filing',
+     'Portfolio',
+     'Monitor',
+     'PackageUpdater',
+     'Submission',
+     'Document',
+     'parse_sgml_submission',
+     'Config'
+ ]
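The new top-level __init__.py exposes the package's main classes and applies nest_asyncio when it detects a Jupyter or Colab shell, so the asyncio-based downloaders can run inside an already-running event loop. Note that __all__ still lists 'Filing', which is not imported in 1.0.3, so a star import would fail on that name. A minimal import sketch, assuming the package and its dependencies (secsgml, nest_asyncio, etc.) are installed:

    # Illustrative only: package-level imports added in datamule 1.0.3.
    from datamule import Downloader, Portfolio, Document, Config

    # In a notebook, nest_asyncio has already been applied by the import above;
    # in a plain script that setup step is simply skipped.
    print(Config().get_default_source())   # None until a default source is set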
datamule-1.0.3/datamule/book/__init__.py (file without changes)
datamule-1.0.3/datamule/book/book.py
@@ -0,0 +1,34 @@
+ # Streams data rather than downloading it.
+ # additional functionality such as query by xbrl, and other db
+ # also this is basically our experimental rework of portfolio w/o disturbing existing users
+ # this is highly experimental and may not work as expected
+ # only for datamule source
+ # likely new bottleneck will be local parsing() - will be bypassed in future when we have parsed archive
+ # wow parsed archive is going to be crazy fast - like every 10k in 1 minute.
+
+ # example queries filter by sic = 7372, xbrl query = dei:operatingprofit > 0 in date range 2018-2019
+
+ # hmm do we go for sql esq or not.
+ # I think we do.
+ # i think we remove cik, ticker, sic, etc and just have a query object
+ # should be sql esq so users can use it easily w/o learnign new syntax
+
+ # WHERE submission_type = '10-K'
+ # AND us-gaap:ResearchAndDevelopmentExpense > 0
+ # AND dei:debt_to_equity < 2
+ # AND filing_date BETWEEN '2023-01-01' AND '2023-12-31'
+ # AND CIK in (123, 456, 789)
+ # AND SIC in (123, 456, 789)
+ # AND ticker in ('AAPL', 'GOOGL', 'AMZN')
+ # AND document_type = 'EX-99.1' # to select attachments
+
+ from .eftsquery import EFTSQuery
+
+
+ class Book():
+     def process_submissions(self,cik,ticker,sic,submission_type,document_type,date,
+                             xbrl_query={},
+                             metadata_callback=None,
+                             document_callback=None,):
+         # grabs data and processes it
+         pass
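book/book.py is an explicitly experimental stub: the comments sketch a SQL-like query interface for streaming filings, and process_submissions currently does nothing. The shape of a call, based only on the signature shown above (the argument values are placeholders, not documented syntax):

    # Hypothetical call shape; the method body is `pass` in 1.0.3, so nothing is returned yet.
    from datamule.book.book import Book

    book = Book()
    book.process_submissions(
        cik=None, ticker='AAPL', sic=7372,
        submission_type='10-K', document_type=None,
        date=('2018-01-01', '2019-12-31'),
        xbrl_query={},                 # query syntax is still undecided per the comments
        metadata_callback=print,
        document_callback=print,
    )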
datamule-1.0.3/datamule/book/eftsquery.py
@@ -0,0 +1,127 @@
+ import asyncio
+ import aiohttp
+ from tqdm import tqdm
+ from datetime import datetime
+ from urllib.parse import urlencode
+ import time
+
+ class PreciseRateLimiter:
+     def __init__(self, rate=10, interval=1.0):
+         self.rate = rate  # requests per interval
+         self.interval = interval  # in seconds
+         self.token_time = self.interval / self.rate  # time per token
+         self.last_time = time.time()
+         self.lock = asyncio.Lock()
+
+     async def acquire(self):
+         async with self.lock:
+             now = time.time()
+             wait_time = self.last_time + self.token_time - now
+             if wait_time > 0:
+                 await asyncio.sleep(wait_time)
+             self.last_time = time.time()
+             return True
+
+ class EFTSQuery:
+     def __init__(self):
+         self.headers = {
+             'User-Agent': 'Your Name yourname@email.com',
+             'Accept-Encoding': 'gzip, deflate',
+             'Host': 'efts.sec.gov'
+         }
+         self.session = None
+         self.limiter = PreciseRateLimiter(10)
+
+     async def __aenter__(self):
+         if not self.session:
+             self.session = aiohttp.ClientSession(headers=self.headers)
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         if self.session:
+             await self.session.close()
+             self.session = None
+
+     async def _fetch_json(self, url):
+         await self.limiter.acquire()
+         try:
+             async with self.session.get(url) as response:
+                 if response.status == 429:
+                     await asyncio.sleep(61)
+                     return await self._fetch_json(url)
+                 return await response.json()
+         except Exception as e:
+             print(f"Error fetching {url}: {str(e)}")
+             return None
+
+     async def _get_accession_numbers(self, base_url):
+         data = await self._fetch_json(f"{base_url}&from=0&size=1")
+         if not data or 'hits' not in data:
+             return []
+
+         total_hits = data['hits']['total']['value']
+         if not total_hits:
+             return []
+
+         accession_numbers = []
+         start = 0
+         page_size = 100
+         batch_size = 10  # Number of concurrent requests
+
+         with tqdm(total=total_hits) as pbar:
+             while start < total_hits:
+                 tasks = []
+                 for i in range(batch_size):
+                     if start + i * page_size >= total_hits:
+                         break
+                     url = f"{base_url}&from={start + i * page_size}&size={page_size}"
+                     tasks.append(self._fetch_json(url))
+
+                 if not tasks:
+                     break
+
+                 results = await asyncio.gather(*tasks)
+
+                 for data in results:
+                     if data and 'hits' in data:
+                         hits = data['hits']['hits']
+                         batch_numbers = [
+                             f"{hit['_source']['ciks'][0]}/{hit['_id'].split(':')[0]}"
+                             for hit in hits
+                         ]
+                         accession_numbers.extend(batch_numbers)
+                         pbar.update(len(hits))
+
+                 start += batch_size * page_size
+
+         return accession_numbers
+
+     def query_efts(self, cik=None, ticker=None, submission_type=None, filing_date=None, search_text=None):
+         async def _download():
+             async with self as downloader:
+                 params = {}
+
+                 if cik:
+                     params['ciks'] = str(cik).zfill(10)
+
+                 if submission_type:
+                     params['forms'] = ','.join(submission_type) if isinstance(submission_type, list) else submission_type
+
+                 if isinstance(filing_date, list):
+                     dates = [(d, d) for d in filing_date]
+                 elif isinstance(filing_date, tuple):
+                     dates = [filing_date]
+                 else:
+                     date_str = filing_date if filing_date else f"2001-01-01,{datetime.now().strftime('%Y-%m-%d')}"
+                     start, end = date_str.split(',')
+                     dates = [(start, end)]
+
+                 params['startdt'], params['enddt'] = dates[0]
+
+                 if search_text:
+                     params['q'] = f'"{search_text}"'
+
+                 base_url = f"https://efts.sec.gov/LATEST/search-index?{urlencode(params, doseq=True)}"
+                 return await self._get_accession_numbers(base_url)
+
+         return asyncio.run(_download())
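PreciseRateLimiter spaces requests interval/rate seconds apart (0.1 s at the default 10 requests per second), and a 429 response triggers a 61-second sleep before retrying. query_efts wraps the async full-text-search client in asyncio.run so it can be called synchronously; note that the ticker argument is accepted but not yet used when building the query parameters, and the class ships with a placeholder User-Agent that should be replaced with your own contact details before calling SEC endpoints. A rough usage sketch, not part of the package:

    from datamule.book.eftsquery import EFTSQuery

    query = EFTSQuery()
    query.headers['User-Agent'] = 'Jane Doe jane.doe@example.com'   # replace the placeholder
    accession_numbers = query.query_efts(
        cik=320193,                                  # zero-padded to 10 digits internally
        submission_type='8-K',
        filing_date=('2023-01-01', '2023-12-31'),
        search_text='cybersecurity',
    )
    print(len(accession_numbers))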
datamule-1.0.3/datamule/book/xbrl_retriever.py
@@ -0,0 +1,88 @@
+ import asyncio
+ import aiohttp
+ import time
+
+ class PreciseRateLimiter:
+     def __init__(self, rate=10, interval=1.0):
+         self.rate = rate
+         self.interval = interval
+         self.token_time = self.interval / self.rate
+         self.last_time = time.time()
+         self.lock = asyncio.Lock()
+
+     async def acquire(self):
+         async with self.lock:
+             now = time.time()
+             wait_time = self.last_time + self.token_time - now
+             if wait_time > 0:
+                 await asyncio.sleep(wait_time)
+             self.last_time = time.time()
+             return True
+
+ class XBRLRetriever:
+     def __init__(self):
+         self.base_url = "https://data.sec.gov/api/xbrl/frames"
+         self.headers = {
+             'User-Agent': 'Your Name yourname@email.com',
+             'Accept-Encoding': 'gzip, deflate',
+             'Host': 'data.sec.gov'
+         }
+         self.session = None
+         self.limiter = PreciseRateLimiter(10)
+
+     async def __aenter__(self):
+         if not self.session:
+             self.session = aiohttp.ClientSession(headers=self.headers)
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         if self.session:
+             await self.session.close()
+             self.session = None
+
+     async def _fetch_json(self, url):
+         await self.limiter.acquire()
+         try:
+             async with self.session.get(url) as response:
+                 if response.status == 429:
+                     await asyncio.sleep(61)
+                     return await self._fetch_json(url)
+                 elif response.status == 200:
+                     return await response.json()
+                 else:
+                     print(f"Error {response.status} for URL: {url}")
+                     return None
+         except Exception as e:
+             print(f"Error fetching {url}: {str(e)}")
+             return None
+
+     def _build_url(self, params):
+         taxonomy = params.get('taxonomy')
+         concept = params.get('concept')
+         unit = params.get('unit')
+         period = params.get('period')
+
+         if not all([taxonomy, concept, unit, period]):
+             raise ValueError("Missing required parameters")
+
+         return f"{self.base_url}/{taxonomy}/{concept}/{unit}/{period}.json"
+
+     async def _get_xbrl_data(self, params_list):
+         tasks = []
+         urls = {}
+
+         for params in params_list:
+             url = self._build_url(params)
+             urls[url] = params
+             tasks.append(self._fetch_json(url))
+
+         results = await asyncio.gather(*tasks)
+
+         return {url: result for url, result in zip(urls.keys(), results) if result is not None}
+
+     def get_xbrl_frames(self, params_list):
+         async def _download():
+             async with self as downloader:
+                 return await self._get_xbrl_data(params_list)
+
+         return asyncio.run(_download())
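XBRLRetriever issues one rate-limited request per entry in params_list against the SEC XBRL frames API and returns a dict keyed by request URL, silently dropping entries that failed. Each entry must provide taxonomy, concept, unit, and period, otherwise _build_url raises ValueError. A hedged sketch; the concept names and 'CY...' period strings follow the SEC frames URL convention and are examples, not values used by the package:

    from datamule.book.xbrl_retriever import XBRLRetriever

    retriever = XBRLRetriever()
    retriever.headers['User-Agent'] = 'Jane Doe jane.doe@example.com'   # replace the placeholder
    frames = retriever.get_xbrl_frames([
        {'taxonomy': 'us-gaap', 'concept': 'AccountsPayableCurrent', 'unit': 'USD', 'period': 'CY2023Q1I'},
        {'taxonomy': 'us-gaap', 'concept': 'Revenues', 'unit': 'USD', 'period': 'CY2022'},
    ])
    for url, payload in frames.items():
        print(url, list(payload)[:5])   # inspect the top-level keys of each frame response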
datamule-1.0.3/datamule/config.py
@@ -0,0 +1,29 @@
+ import json
+ import os
+
+ class Config:
+     def __init__(self):
+         self.config_path = os.path.expanduser("~/.datamule/config.json")
+         self._ensure_config_exists()
+
+     def _ensure_config_exists(self):
+         os.makedirs(os.path.dirname(self.config_path), exist_ok=True)
+         if not os.path.exists(self.config_path):
+             self._save_config({"default_source": None})
+
+     def _save_config(self, config):
+         with open(self.config_path, 'w') as f:
+             json.dump(config, f)
+
+     def set_default_source(self, source):
+         config = self._load_config()
+         config["default_source"] = source
+         self._save_config(config)
+
+     def get_default_source(self):
+         config = self._load_config()
+         return config.get("default_source")
+
+     def _load_config(self):
+         with open(self.config_path) as f:
+             return json.load(f)
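Config persists a single default_source value in ~/.datamule/config.json; the file is created with {"default_source": null} the first time the class is instantiated. A minimal sketch (the source string below is an example value, not a documented constant):

    from datamule import Config

    config = Config()
    config.set_default_source('datamule')   # example value; whatever string the downloaders accept
    print(config.get_default_source())      # -> 'datamule'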
datamule-1.0.3/datamule/document.py
@@ -0,0 +1,279 @@
+ import json
+ import csv
+ from .helper import convert_to_dashed_accession
+ import re
+ from doc2dict import xml2dict, txt2dict, dict2dict
+ from doc2dict.mapping import flatten_hierarchy
+ from .mapping_dicts import txt_mapping_dicts
+ from .mapping_dicts import xml_mapping_dicts
+ from selectolax.parser import HTMLParser
+
+ class Document:
+     def __init__(self, type, filename):
+         self.type = type
+         self.path = filename
+
+         self.data = None
+         self.content = None
+
+
+     def load_content(self,encoding='utf-8'):
+         with open(self.path, 'r',encoding=encoding) as f:
+             self.content = f.read()
+
+     def _load_text_content(self):
+         with open(self.path) as f:
+             return f.read().translate(str.maketrans({
+                 '\xa0': ' ', '\u2003': ' ',
+                 '\u2018': "'", '\u2019': "'",
+                 '\u201c': '"', '\u201d': '"'
+             }))
+
+     # will deprecate this when we add html2dict
+     def _load_html_content(self):
+         with open(self.path,'rb') as f:
+             parser = HTMLParser(f.read(),detect_encoding=True,decode_errors='ignore')
+
+         # Remove hidden elements first
+         hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
+         for node in hidden_nodes:
+             node.decompose()
+
+         blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
+         lines = []
+         current_line = []
+
+         def flush_line():
+             if current_line:
+                 # Don't add spaces between adjacent spans
+                 lines.append(''.join(current_line))
+                 current_line.clear()
+
+         for node in parser.root.traverse(include_text=True):
+             if node.tag in ('script', 'style', 'css'):
+                 continue
+
+             if node.tag in blocks:
+                 flush_line()
+                 lines.append('')
+
+             if node.text_content:
+                 text = node.text_content.strip()
+                 if text:
+                     if node.tag in blocks:
+                         flush_line()
+                         lines.append(text)
+                         lines.append('')
+                     else:
+                         # Only add space if nodes aren't directly adjacent
+                         if current_line and not current_line[-1].endswith(' '):
+                             if node.prev and node.prev.text_content:
+                                 if node.parent != node.prev.parent or node.prev.next != node:
+                                     current_line.append(' ')
+                         current_line.append(text)
+
+         flush_line()
+
+         text = '\n'.join(lines)
+         while '\n\n\n' in text:
+             text = text.replace('\n\n\n', '\n\n')
+
+         return text.translate(str.maketrans({
+             '\xa0': ' ', '\u2003': ' ',
+             '\u2018': "'", '\u2019': "'",
+             '\u201c': '"', '\u201d': '"'
+         }))
+
+     def _load_file_content(self):
+         if self.path.suffix =='.txt':
+             self.content = self._load_text_content()
+         elif self.path.suffix in ['.html','.htm']:
+             self.content = self._load_html_content()
+         else:
+             raise ValueError(f"Unsupported file type: {self.path.suffix}")
+
+
+     def contains_string(self, pattern):
+         """Currently only works for .htm, .html, and .txt files"""
+         if self.path.suffix in ['.htm', '.html', '.txt']:
+             if self.content is None:
+                 self.content = self._load_file_content(self.path)
+             return bool(re.search(pattern, self.content))
+         return False
+
+     # Note: this method will be heavily modified in the future
+     def parse(self):
+         mapping_dict = None
+
+         if self.path.suffix == '.xml':
+             if self.type in ['3', '4', '5']:
+                 mapping_dict = xml_mapping_dicts.dict_345
+
+             self.load_content()
+             self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
+         # will deprecate this when we add html2dict
+         elif self.path.suffix in ['.htm', '.html','.txt']:
+             self._load_file_content()
+
+             if self.type == '10-K':
+                 mapping_dict = txt_mapping_dicts.dict_10k
+             elif self.type == '10-Q':
+                 mapping_dict = txt_mapping_dicts.dict_10q
+             elif self.type == '8-K':
+                 mapping_dict = txt_mapping_dicts.dict_8k
+             elif self.type == 'SC 13D':
+                 mapping_dict = txt_mapping_dicts.dict_13d
+             elif self.type == 'SC 13G':
+                 mapping_dict = txt_mapping_dicts.dict_13g
+
+             self.data = {}
+             self.data['document'] = dict2dict(txt2dict(content=self.content, mapping_dict=mapping_dict))
+         return self.data
+
+     def write_json(self, output_filename=None):
+         if not self.data:
+             self.parse()
+
+         if output_filename is None:
+             output_filename = f"{self.path.rsplit('.', 1)[0]}.json"
+
+         with open(output_filename, 'w',encoding='utf-8') as f:
+             json.dump(self.data, f, indent=2)
+
+     def write_csv(self, output_filename=None, accession_number=None):
+         self.parse()
+
+         if output_filename is None:
+             output_filename = f"{self.path.rsplit('.', 1)[0]}.csv"
+
+         with open(output_filename, 'w', newline='') as csvfile:
+             if not self.data:
+                 return output_filename
+
+             has_document = any('document' in item for item in self.data)
+
+             if has_document and 'document' in self.data:
+                 writer = csv.DictWriter(csvfile, ['section', 'text'], quoting=csv.QUOTE_ALL)
+                 writer.writeheader()
+                 flattened = self._flatten_dict(self.data['document'])
+                 for section, text in flattened.items():
+                     writer.writerow({'section': section, 'text': text})
+             else:
+                 fieldnames = list(self.data[0].keys())
+                 if accession_number:
+                     fieldnames.append('Accession Number')
+                 writer = csv.DictWriter(csvfile, fieldnames, quoting=csv.QUOTE_ALL)
+                 writer.writeheader()
+                 for row in self.data:
+                     if accession_number:
+                         row['Accession Number'] = convert_to_dashed_accession(accession_number)
+                     writer.writerow(row)
+
+         return output_filename
+
+     def _document_to_section_text(self, document_data, parent_key=''):
+         items = []
+
+         if isinstance(document_data, dict):
+             for key, value in document_data.items():
+                 # Build the section name
+                 section = f"{parent_key}_{key}" if parent_key else key
+
+                 # If the value is a dict, recurse
+                 if isinstance(value, dict):
+                     items.extend(self._document_to_section_text(value, section))
+                 # If it's a list, handle each item
+                 elif isinstance(value, list):
+                     for i, item in enumerate(value):
+                         if isinstance(item, dict):
+                             items.extend(self._document_to_section_text(item, f"{section}_{i+1}"))
+                         else:
+                             items.append({
+                                 'section': f"{section}_{i+1}",
+                                 'text': str(item)
+                             })
+                 # Base case - add the item
+                 else:
+                     items.append({
+                         'section': section,
+                         'text': str(value)
+                     })
+
+         return items
+
+     # we'll modify this for every dict
+     def _flatten_dict(self, d, parent_key=''):
+         items = {}
+
+         if isinstance(d, list):
+             return [self._flatten_dict(item) for item in d]
+
+         for k, v in d.items():
+             new_key = f"{parent_key}_{k}" if parent_key else k
+
+             if isinstance(v, dict):
+                 items.update(self._flatten_dict(v, new_key))
+             else:
+                 items[new_key] = str(v)
+
+         return items
+
+     # this will all have to be changed. default will be to flatten everything
+     def __iter__(self):
+         if not self.data:
+             self.parse()
+
+         # Let's remove XML iterable for now
+
+         # Handle text-based documents
+         if self.path.suffix in ['.txt', '.htm', '.html']:
+             document_data = self.data
+             if not document_data:
+                 return iter([])
+
+             # Find highest hierarchy level from mapping dict
+             highest_hierarchy = float('inf')
+             section_type = None
+
+             if self.type in ['10-K', '10-Q']:
+                 mapping_dict = txt_mapping_dicts.dict_10k if self.type == '10-K' else txt_mapping_dicts.dict_10q
+             elif self.type == '8-K':
+                 mapping_dict = txt_mapping_dicts.dict_8k
+             elif self.type == 'SC 13D':
+                 mapping_dict = txt_mapping_dicts.dict_13d
+             elif self.type == 'SC 13G':
+                 mapping_dict = txt_mapping_dicts.dict_13g
+             else:
+                 return iter([])
+
+             # Find section type with highest hierarchy number
+             highest_hierarchy = -1  # Start at -1 to find highest
+             for mapping in mapping_dict['rules']['mappings']:
+                 if mapping.get('hierarchy') is not None:
+                     if mapping['hierarchy'] > highest_hierarchy:
+                         highest_hierarchy = mapping['hierarchy']
+                         section_type = mapping['name']
+
+             if not section_type:
+                 return iter([])
+
+             # Extract sections of the identified type
+             def find_sections(data, target_type):
+                 sections = []
+                 if isinstance(data, dict):
+                     if data.get('type') == target_type:
+                         sections.append({
+                             'item': data.get('text', ''),
+                             'text': flatten_hierarchy(data.get('content', []))
+                         })
+                     for value in data.values():
+                         if isinstance(value, (dict, list)):
+                             sections.extend(find_sections(value, target_type))
+                 elif isinstance(data, list):
+                     for item in data:
+                         sections.extend(find_sections(item, target_type))
+                 return sections
+
+             return iter(find_sections(document_data, section_type))
+
+         return iter([])
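Document dispatches on the file suffix and the form type: XML forms 3/4/5 go through xml2dict, while 10-K, 10-Q, 8-K, SC 13D, and SC 13G text or HTML goes through txt2dict with the matching mapping dict, and __iter__ yields the sections whose mapping rule carries the highest hierarchy number. One wrinkle worth noting: parse() reads self.path.suffix (a pathlib.Path attribute) while write_json() falls back to self.path.rsplit(...) (a str method), so the sketch below passes a Path and an explicit output filename to avoid relying on either fallback. The file path is hypothetical:

    from pathlib import Path
    from datamule import Document

    doc = Document(type='8-K', filename=Path('filings/0000320193-24-000005/pressrelease.htm'))
    data = doc.parse()                        # nested dict stored under data['document']
    doc.write_json(output_filename='pressrelease.json')

    for section in doc:                       # highest-hierarchy sections, e.g. 8-K items
        print(section['item'])
        print(section['text'][:200])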