pitedgar 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pitedgar-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 pitedgar contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,149 @@
1
+ Metadata-Version: 2.4
2
+ Name: pitedgar
3
+ Version: 0.1.0
4
+ Summary: Point-in-time SEC EDGAR financial data pipeline
5
+ License: MIT
6
+ License-File: LICENSE
7
+ Keywords: SEC,EDGAR,financial-data,point-in-time,XBRL,backtesting
8
+ Author: Ariel Nacamulli
9
+ Requires-Python: >=3.11
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Financial and Insurance Industry
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Programming Language :: Python :: 3.14
19
+ Classifier: Topic :: Office/Business :: Financial
20
+ Classifier: Topic :: Scientific/Engineering
21
+ Classifier: Typing :: Typed
22
+ Requires-Dist: click
23
+ Requires-Dist: edgartools (>=5.0)
24
+ Requires-Dist: loguru
25
+ Requires-Dist: pandas (>=2.0)
26
+ Requires-Dist: pyarrow
27
+ Requires-Dist: pydantic (>=2.0)
28
+ Requires-Dist: requests
29
+ Requires-Dist: tqdm
30
+ Project-URL: Documentation, https://github.com/arielNacamulli/pitedgar#readme
31
+ Project-URL: Homepage, https://github.com/arielNacamulli/pitedgar
32
+ Project-URL: Repository, https://github.com/arielNacamulli/pitedgar
33
+ Description-Content-Type: text/markdown
34
+
35
+ # pitedgar
36
+
37
+ [![CI](https://github.com/arielNacamulli/pitedgar/actions/workflows/ci.yml/badge.svg)](https://github.com/arielNacamulli/pitedgar/actions/workflows/ci.yml)
38
+ [![PyPI version](https://img.shields.io/pypi/v/pitedgar.svg)](https://pypi.org/project/pitedgar/)
39
+ [![Python versions](https://img.shields.io/pypi/pyversions/pitedgar.svg)](https://pypi.org/project/pitedgar/)
40
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
41
+
42
+ Point-in-time SEC EDGAR financial data pipeline.
43
+
44
+ Downloads SEC EDGAR `companyfacts.zip`, parses XBRL JSON facts into a local
45
+ parquet file, and exposes a query API with **zero look-ahead bias** — every
46
+ value is stamped with the `filed` date (when the data was actually available
47
+ to the market), not the period-end date.
48
+
49
+ ---
50
+
51
+ ## Installation
52
+
53
+ ```bash
54
+ pip install pitedgar
55
+ # or, from a clone of this repository, with Poetry
56
+ poetry install
57
+ ```
58
+
59
+ ---
60
+
61
+ ## Quick start
62
+
63
+ ```python
64
+ from pathlib import Path
65
+ from pitedgar import PitEdgarConfig, build_cik_map, download_bulk, parse_all, PitQuery
66
+
67
+ config = PitEdgarConfig(
68
+ edgar_identity="Mario Rossi mario@example.com", # required by SEC
69
+ data_dir=Path("./data"),
70
+ )
71
+
72
+ # Step 1 — one-shot ticker → CIK mapping
73
+ tickers = ["AAPL", "MSFT", "JPM", "GOOGL"]
74
+ cik_map = build_cik_map(tickers, config)
75
+
76
+ # Step 2 — download ~1.5 GB bulk ZIP (do this periodically, not every run)
77
+ download_bulk(config)
78
+
79
+ # Step 3 — parse JSON → parquet (sub-minute for 500 companies)
80
+ master = parse_all(config, cik_map)
81
+
82
+ # Step 4 — query
83
+ q = PitQuery(config.data_dir / "pit_financials.parquet")
84
+
85
+ # What revenue figure was available to the market on 2022-06-30?
86
+ result = q.as_of(["AAPL", "MSFT"], "us-gaap:Revenues", "2022-06-30")
87
+
88
+ # Full history
89
+ hist = q.history("AAPL", "us-gaap:NetIncomeLoss", freq="A")
90
+
91
+ # Portfolio cross-section signal
92
+ xs = q.cross_section("us-gaap:NetIncomeLoss", "2023-12-31")
93
+ ```
94
+
95
+ ---
96
+
97
+ ## CLI
98
+
99
+ ```bash
100
+ # Resolve tickers (tickers.txt has one ticker per line)
101
+ pitedgar map --tickers tickers.txt --identity "Name name@email.com"
102
+
103
+ # Download bulk ZIP
104
+ pitedgar fetch --identity "Name name@email.com"
105
+
106
+ # Parse to parquet
107
+ pitedgar build --identity "Name name@email.com"
108
+
109
+ # Query a single value
110
+ pitedgar query --ticker AAPL --concept us-gaap:Revenues --as-of 2023-06-30
111
+ ```
112
+
113
+ ---
114
+
115
+ ## Key design decisions
116
+
117
+ | Decision | Rationale |
118
+ |---|---|
119
+ | `filed` as PIT timestamp | The date the filing was submitted to SEC — this is when information became public |
120
+ | Deduplication keeps latest `filed` per `(concept, end)` | Companies sometimes refile restated figures; keep the superseding value |
121
+ | Raw USD values, no scale conversion | SEC reports values as-filed; downstream code applies any needed normalization |
122
+ | Local parquet, no runtime HTTP | Queries run at DataFrame speed with no network dependency |
123
+
124
+ ---
125
+
126
+ ## Supported XBRL concepts (defaults)
127
+
128
+ See `pitedgar.config.DEFAULT_CONCEPTS` for the full list, which includes
129
+ revenues, net income, assets, liabilities, equity, EPS, cash, debt, operating
130
+ cash flow, capex, and R&D expense.
131
+
132
+ ---
133
+
134
+ ## Examples
135
+
136
+ - [`examples/fcf_sp500.py`](examples/fcf_sp500.py) — S&P 500 free cash flow benchmark: fetches constituents, builds the parquet, and queries FCF cross-sections across 20 quarters. Useful as an end-to-end performance reference.
137
+
138
+ ---
139
+
140
+ ## Contributing
141
+
142
+ Contributions are welcome. See [CONTRIBUTING.md](CONTRIBUTING.md) for setup instructions, coding conventions, and the PR process.
143
+
144
+ ---
145
+
146
+ ## License
147
+
148
+ [MIT](LICENSE)
149
+
@@ -0,0 +1,114 @@
1
+ # pitedgar
2
+
3
+ [![CI](https://github.com/arielNacamulli/pitedgar/actions/workflows/ci.yml/badge.svg)](https://github.com/arielNacamulli/pitedgar/actions/workflows/ci.yml)
4
+ [![PyPI version](https://img.shields.io/pypi/v/pitedgar.svg)](https://pypi.org/project/pitedgar/)
5
+ [![Python versions](https://img.shields.io/pypi/pyversions/pitedgar.svg)](https://pypi.org/project/pitedgar/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
7
+
8
+ Point-in-time SEC EDGAR financial data pipeline.
9
+
10
+ Downloads SEC EDGAR `companyfacts.zip`, parses XBRL JSON facts into a local
11
+ parquet file, and exposes a query API with **zero look-ahead bias** — every
12
+ value is stamped with the `filed` date (when the data was actually available
13
+ to the market), not the period-end date.
14
+
15
+ ---
16
+
17
+ ## Installation
18
+
19
+ ```bash
20
+ pip install pitedgar
21
+ # or, from a clone of this repository, with Poetry
22
+ poetry install
23
+ ```
24
+
25
+ ---
26
+
27
+ ## Quick start
28
+
29
+ ```python
30
+ from pathlib import Path
31
+ from pitedgar import PitEdgarConfig, build_cik_map, download_bulk, parse_all, PitQuery
32
+
33
+ config = PitEdgarConfig(
34
+ edgar_identity="Mario Rossi mario@example.com", # required by SEC
35
+ data_dir=Path("./data"),
36
+ )
37
+
38
+ # Step 1 — one-shot ticker → CIK mapping
39
+ tickers = ["AAPL", "MSFT", "JPM", "GOOGL"]
40
+ cik_map = build_cik_map(tickers, config)
41
+
42
+ # Step 2 — download ~1.5 GB bulk ZIP (do this periodically, not every run)
43
+ download_bulk(config)
44
+
45
+ # Step 3 — parse JSON → parquet (sub-minute for 500 companies)
46
+ master = parse_all(config, cik_map)
47
+
48
+ # Step 4 — query
49
+ q = PitQuery(config.data_dir / "pit_financials.parquet")
50
+
51
+ # What revenue figure was available to the market on 2022-06-30?
52
+ result = q.as_of(["AAPL", "MSFT"], "us-gaap:Revenues", "2022-06-30")
53
+
54
+ # Full history
55
+ hist = q.history("AAPL", "us-gaap:NetIncomeLoss", freq="A")
56
+
57
+ # Portfolio cross-section signal
58
+ xs = q.cross_section("us-gaap:NetIncomeLoss", "2023-12-31")
59
+ ```
60
+
61
+ ---
62
+
63
+ ## CLI
64
+
65
+ ```bash
66
+ # Resolve tickers (tickers.txt has one ticker per line)
67
+ pitedgar map --tickers tickers.txt --identity "Name name@email.com"
68
+
69
+ # Download bulk ZIP
70
+ pitedgar fetch --identity "Name name@email.com"
71
+
72
+ # Parse to parquet
73
+ pitedgar build --identity "Name name@email.com"
74
+
75
+ # Query a single value
76
+ pitedgar query --ticker AAPL --concept us-gaap:Revenues --as-of 2023-06-30
77
+ ```
78
+
79
+ ---
80
+
81
+ ## Key design decisions
82
+
83
+ | Decision | Rationale |
84
+ |---|---|
85
+ | `filed` as PIT timestamp | The date the filing was submitted to SEC — this is when information became public |
86
+ | Deduplication keeps latest `filed` per `(concept, end)` | Companies sometimes refile restated figures; keep the superseding value |
87
+ | Raw USD values, no scale conversion | SEC reports values as-filed; downstream code applies any needed normalization |
88
+ | Local parquet, no runtime HTTP | Queries run at DataFrame speed with no network dependency |
89
+
90
+ ---
91
+
92
+ ## Supported XBRL concepts (defaults)
93
+
94
+ See `pitedgar.config.DEFAULT_CONCEPTS` for the full list, which includes
95
+ revenues, net income, assets, liabilities, equity, EPS, cash, debt, operating
96
+ cash flow, capex, and R&D expense.
97
+
98
+ ---
99
+
100
+ ## Examples
101
+
102
+ - [`examples/fcf_sp500.py`](examples/fcf_sp500.py) — S&P 500 free cash flow benchmark: fetches constituents, builds the parquet, and queries FCF cross-sections across 20 quarters. Useful as an end-to-end performance reference.
103
+
104
+ ---
105
+
106
+ ## Contributing
107
+
108
+ Contributions are welcome. See [CONTRIBUTING.md](CONTRIBUTING.md) for setup instructions, coding conventions, and the PR process.
109
+
110
+ ---
111
+
112
+ ## License
113
+
114
+ [MIT](LICENSE)
@@ -0,0 +1,17 @@
1
+ """pitedgar — Point-in-time SEC EDGAR financial data pipeline."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ from pitedgar.config import PitEdgarConfig
6
+ from pitedgar.mapping import build_cik_map
7
+ from pitedgar.downloader import download_bulk
8
+ from pitedgar.parser import parse_all
9
+ from pitedgar.query import PitQuery
10
+
11
+ __all__ = [
12
+ "PitEdgarConfig",
13
+ "build_cik_map",
14
+ "download_bulk",
15
+ "parse_all",
16
+ "PitQuery",
17
+ ]
@@ -0,0 +1,71 @@
1
+ """CLI entry points for pitedgar."""
2
+
3
+ from pathlib import Path
4
+
5
+ import click
6
+ import pandas as pd
7
+
8
+ from pitedgar.config import PitEdgarConfig
9
+ from pitedgar.downloader import download_bulk
10
+ from pitedgar.mapping import build_cik_map
11
+ from pitedgar.parser import parse_all
12
+ from pitedgar.query import PitQuery
13
+
14
+
15
# Root command group; the sub-commands below attach to it via @cli.command.
# The docstring doubles as the --help text, so it is user-facing.
@click.group()
def cli() -> None:
    """pitedgar — SEC EDGAR point-in-time financial data pipeline."""
18
+
19
+
20
@cli.command("map")
@click.option(
    "--tickers",
    "tickers_file",
    required=True,
    type=click.Path(exists=True),
    help="Path to a text file with one ticker per line.",
)
@click.option(
    "--identity",
    required=True,
    help='SEC identity string, e.g. "Name name@email.com".',
)
@click.option(
    "--data-dir",
    default="./data",
    show_default=True,
    help="Directory where outputs are saved.",
)
def cmd_map(tickers_file: str, identity: str, data_dir: str) -> None:
    """Resolve tickers to CIK numbers and save the mapping."""
    # One symbol per line: surrounding whitespace is trimmed, blank lines are
    # dropped and the remaining symbols are upper-cased.
    symbols = []
    for raw_line in Path(tickers_file).read_text().splitlines():
        cleaned = raw_line.strip()
        if cleaned:
            symbols.append(cleaned.upper())

    config = PitEdgarConfig(edgar_identity=identity, data_dir=Path(data_dir))
    cik_map = build_cik_map(symbols, config)

    map_path = config.data_dir / "ticker_cik_map.parquet"
    click.echo(f"Mapped {len(cik_map)} tickers → {map_path}")
34
+
35
+
36
@cli.command("fetch")
@click.option(
    "--force",
    is_flag=True,
    default=False,
    help="Re-extract even if facts_dir already populated.",
)
@click.option(
    "--identity",
    required=True,
    help='SEC identity string, e.g. "Name name@email.com".',
)
@click.option("--data-dir", default="./data", show_default=True)
def cmd_fetch(force: bool, identity: str, data_dir: str) -> None:
    """Download and extract the SEC companyfacts bulk ZIP."""
    settings = PitEdgarConfig(edgar_identity=identity, data_dir=Path(data_dir))
    # download_bulk returns the directory the JSON facts were extracted into.
    extracted_to = download_bulk(settings, force=force)
    click.echo(f"Facts extracted to {extracted_to}")
47
+
48
+
49
@cli.command("build")
@click.option(
    "--identity",
    required=True,
    help='SEC identity string, e.g. "Name name@email.com".',
)
@click.option("--data-dir", default="./data", show_default=True)
def cmd_build(identity: str, data_dir: str) -> None:
    """Parse all local JSON facts into the master PIT parquet."""
    config = PitEdgarConfig(edgar_identity=identity, data_dir=Path(data_dir))

    # The ticker→CIK map is produced by the `map` command. Fail with a short,
    # actionable CLI error instead of a parquet-reader traceback when the
    # user has not run that step yet (or points at the wrong --data-dir).
    map_path = config.data_dir / "ticker_cik_map.parquet"
    if not map_path.exists():
        raise click.ClickException(
            f"Ticker/CIK map not found at {map_path}. Run `pitedgar map` first."
        )

    cik_map = pd.read_parquet(map_path)
    master = parse_all(config, cik_map)
    click.echo(f"Built master parquet: {len(master):,} rows")
59
+
60
+
61
@cli.command("query")
@click.option("--ticker", required=True)
@click.option("--concept", required=True, help='e.g. "us-gaap:Revenues"')
@click.option("--as-of", "as_of", required=True, help="ISO date, e.g. 2023-06-30")
@click.option("--data-dir", default="./data", show_default=True)
def cmd_query(ticker: str, concept: str, as_of: str, data_dir: str) -> None:
    """Query the latest PIT value for a ticker/concept as of a date."""
    parquet_path = Path(data_dir) / "pit_financials.parquet"

    # The master parquet is written by the `build` command. Report its absence
    # as a normal CLI error rather than letting the reader raise a traceback.
    if not parquet_path.exists():
        raise click.ClickException(
            f"Master parquet not found at {parquet_path}. Run `pitedgar build` first."
        )

    q = PitQuery(parquet_path)
    # Tickers are looked up in upper case, so normalise the user's input.
    result = q.as_of([ticker.upper()], concept, as_of)
    click.echo(result.to_string(index=False))
@@ -0,0 +1,48 @@
1
+ """Configuration for pitedgar via Pydantic BaseModel."""
2
+
3
+ from pathlib import Path
4
+ from pydantic import BaseModel, model_validator
5
+
6
# XBRL concepts extracted by default, written as "<taxonomy>:<ConceptName>".
# The parser looks each name up in a company's us-gaap facts and silently
# skips concepts the company never reported, so extra entries are harmless.
DEFAULT_CONCEPTS = [
    "us-gaap:Revenues",
    "us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax",
    "us-gaap:NetIncomeLoss",
    "us-gaap:Assets",
    "us-gaap:Liabilities",
    "us-gaap:StockholdersEquity",
    "us-gaap:OperatingIncomeLoss",
    "us-gaap:GrossProfit",
    "us-gaap:EarningsPerShareBasic",
    "us-gaap:EarningsPerShareDiluted",
    "us-gaap:CommonStockSharesOutstanding",
    "us-gaap:CashAndCashEquivalentsAtCarryingValue",
    "us-gaap:LongTermDebt",
    # NOTE(review): "OperatingCashFlow" does not look like a standard us-gaap
    # element name; kept for backward compatibility — confirm or drop.
    "us-gaap:OperatingCashFlow",
    "us-gaap:NetCashProvidedByUsedInOperatingActivities",
    # Cash paid for property, plant and equipment: the cash-flow-statement
    # capex line. Added because the entry below is only the accrued,
    # not-yet-paid (non-cash) portion of capital expenditures.
    "us-gaap:PaymentsToAcquirePropertyPlantAndEquipment",
    "us-gaap:CapitalExpendituresIncurredButNotYetPaid",
    "us-gaap:ResearchAndDevelopmentExpense",
]

# Filing form types kept by default (annual and quarterly reports).
DEFAULT_FORMS = ["10-K", "10-Q"]

# Location of the SEC bulk archive containing one JSON facts file per company.
BULK_ZIP_URL = "https://www.sec.gov/Archives/edgar/daily-index/xbrl/companyfacts.zip"
28
+
29
+
30
class PitEdgarConfig(BaseModel):
    """Settings shared by every pipeline step.

    ``facts_dir`` is optional; when it is left as ``None`` it is filled in
    after validation with ``data_dir / "companyfacts"``.
    """

    # Identity string for the SEC (sent as the download User-Agent and handed
    # to edgartools by the mapping step).
    edgar_identity: str
    # Root directory for downloaded and generated files.
    data_dir: Path
    # Directory holding the extracted CIK*.json files.
    facts_dir: Path | None = None
    # URL of the bulk companyfacts archive.
    zip_url: str = BULK_ZIP_URL
    # Concepts and filing forms retained when parsing.
    concepts: list[str] = DEFAULT_CONCEPTS
    forms: list[str] = DEFAULT_FORMS

    model_config = {"arbitrary_types_allowed": True}

    @model_validator(mode="after")
    def set_facts_dir(self) -> "PitEdgarConfig":
        """Default ``facts_dir`` to a ``companyfacts`` folder under ``data_dir``."""
        if self.facts_dir is not None:
            return self
        self.facts_dir = self.data_dir / "companyfacts"
        return self

    def ensure_dirs(self) -> None:
        """Create ``data_dir`` and ``facts_dir`` (with parents) if missing."""
        for directory in (self.data_dir, self.facts_dir):
            directory.mkdir(parents=True, exist_ok=True)
@@ -0,0 +1,59 @@
1
+ """Step 2: bulk download companyfacts.zip from SEC EDGAR."""
2
+
3
+ import zipfile
4
+ from pathlib import Path
5
+
6
+ import requests
7
+ from loguru import logger
8
+ from tqdm import tqdm
9
+
10
+ from pitedgar.config import PitEdgarConfig
11
+
12
+
13
def download_bulk(config: PitEdgarConfig, force: bool = False) -> Path:
    """Download and extract the SEC companyfacts bulk ZIP.

    The archive is streamed to a temporary ``companyfacts.zip.part`` file and
    only renamed to ``companyfacts.zip`` once the transfer has finished, so an
    interrupted download never leaves a truncated archive at the final path.

    Args:
        config: pipeline configuration.
        force: re-extract even if facts_dir already exists.

    Returns:
        Path to the extracted facts directory.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    config.ensure_dirs()
    zip_path = config.data_dir / "companyfacts.zip"
    partial_path = zip_path.with_name(zip_path.name + ".part")

    # The SEC identity string is sent as the User-Agent header.
    headers = {"User-Agent": config.edgar_identity}

    logger.info(f"Downloading {config.zip_url} …")
    try:
        with requests.get(config.zip_url, stream=True, headers=headers, timeout=120) as resp:
            resp.raise_for_status()
            # A missing or zero Content-Length gives an open-ended progress bar.
            total = int(resp.headers.get("Content-Length", 0)) or None
            with (
                open(partial_path, "wb") as fh,
                tqdm(
                    total=total,
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                    desc="companyfacts.zip",
                ) as bar,
            ):
                for chunk in resp.iter_content(chunk_size=1 << 20):
                    if not chunk:
                        # Nothing to write for an empty chunk.
                        continue
                    fh.write(chunk)
                    bar.update(len(chunk))
        # Publish the archive only after the full body has been written.
        partial_path.replace(zip_path)
    finally:
        # After a successful rename this is a no-op; after a failure it
        # removes the incomplete file.
        partial_path.unlink(missing_ok=True)

    logger.info(f"ZIP saved: {zip_path}")

    facts_dir = config.facts_dir
    # Extract when forced, or when the target directory is missing or empty.
    if force or not facts_dir.exists() or not any(facts_dir.iterdir()):
        logger.info(f"Extracting to {facts_dir} …")
        with zipfile.ZipFile(zip_path, "r") as zf:
            members = zf.namelist()
            for member in tqdm(members, desc="Extracting", unit="file"):
                zf.extract(member, facts_dir)
        logger.info("Extraction complete.")
    else:
        logger.info("Facts dir already populated, skipping extraction (use force=True to override).")

    return facts_dir
@@ -0,0 +1,45 @@
1
+ """Step 1: resolve ticker → CIK via edgartools."""
2
+
3
+ import time
4
+ import edgar
5
+ import pandas as pd
6
+ from loguru import logger
7
+ from pitedgar.config import PitEdgarConfig
8
+
9
+
10
def build_cik_map(tickers: list[str], config: PitEdgarConfig) -> pd.DataFrame:
    """Resolve a list of tickers to CIK numbers via edgartools.

    Tickers that cannot be resolved are logged and skipped. If none resolve,
    an empty (but correctly shaped) mapping is still written and returned.

    Saves the result to data_dir/ticker_cik_map.parquet.
    Returns a DataFrame indexed by ticker with columns:
        cik, name, sic, fiscal_year_end, exchange
    """
    edgar.set_identity(config.edgar_identity)
    config.ensure_dirs()

    columns = ["ticker", "cik", "name", "sic", "fiscal_year_end", "exchange"]
    records: list[dict] = []

    for ticker in tickers:
        try:
            company = edgar.Company(ticker)
            # CIKs are stored as zero-padded 10-digit strings, matching the
            # CIK##########.json file names the parser looks for.
            cik_padded = f"{company.cik:010d}"
            records.append(
                {
                    "ticker": ticker.upper(),
                    "cik": cik_padded,
                    "name": getattr(company, "name", None),
                    "sic": getattr(company, "sic", None),
                    "fiscal_year_end": getattr(company, "fiscal_year_end", None),
                    "exchange": getattr(company, "exchange", None),
                }
            )
            logger.debug(f"{ticker} → CIK {cik_padded}")
        except Exception as exc:
            # Best effort: one bad ticker must not abort the whole mapping.
            logger.warning(f"Could not resolve ticker '{ticker}': {exc}")
        # Short pause between lookups.
        time.sleep(0.1)

    # Passing the column list keeps "ticker" present even when nothing was
    # resolved; without it set_index raises KeyError on an empty frame.
    df = pd.DataFrame(records, columns=columns).set_index("ticker")
    out_path = config.data_dir / "ticker_cik_map.parquet"
    df.to_parquet(out_path)
    logger.info(f"CIK map saved: {out_path} ({len(df)} rows)")
    return df
@@ -0,0 +1,154 @@
1
+ """Step 3: parse local JSON facts into a point-in-time parquet master."""
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ import pandas as pd
7
+ from loguru import logger
8
+ from tqdm import tqdm
9
+
10
+ from pitedgar.config import PitEdgarConfig
11
+
12
# Units to attempt per concept, in priority order.
# Per-share amounts (EPS) are reported under the ratio unit "USD/shares",
# share counts under "shares", and everything else under "USD".
_EPS_CONCEPTS = frozenset(
    {
        "EarningsPerShareBasic",
        "EarningsPerShareDiluted",
    }
)

_SHARE_CONCEPTS = {
    "EarningsPerShareBasic",
    "EarningsPerShareDiluted",
    "CommonStockSharesOutstanding",
}


def _preferred_units(concept_short: str) -> list[str]:
    """Return the unit keys to try for a concept, most likely first.

    Args:
        concept_short: concept name without the taxonomy prefix,
            e.g. "EarningsPerShareBasic".

    Returns:
        Unit keys in lookup order. EPS concepts try "USD/shares" first and
        then fall back to the previous ["shares", "USD"] order; other share
        concepts use ["shares", "USD"]; all remaining concepts use
        ["USD", "shares"].
    """
    if concept_short in _EPS_CONCEPTS:
        return ["USD/shares", "shares", "USD"]
    if concept_short in _SHARE_CONCEPTS:
        return ["shares", "USD"]
    return ["USD", "shares"]
25
+
26
+
27
def parse_company(
    cik_padded: str,
    concepts: list[str],
    facts_dir: Path,
    forms: list[str],
) -> pd.DataFrame:
    """Turn one company's facts JSON into a tidy point-in-time table.

    Args:
        cik_padded: zero-padded 10-digit CIK string.
        concepts: list of "us-gaap:ConceptName" strings.
        facts_dir: directory containing CIK*.json files.
        forms: filing forms to keep, e.g. ["10-K", "10-Q"].

    Returns:
        DataFrame with columns: cik, concept, end, filed, val, form, accn.
        An empty, column-less DataFrame is returned when the JSON file is
        missing, has no us-gaap facts, or yields no usable rows.
    """
    source = facts_dir / f"CIK{cik_padded}.json"
    if not source.exists():
        logger.debug(f"No JSON for CIK {cik_padded}, skipping.")
        return pd.DataFrame()

    with open(source, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    gaap_facts = payload.get("facts", {}).get("us-gaap", {})
    if not gaap_facts:
        return pd.DataFrame()

    records: list[dict] = []

    for concept_full in concepts:
        # "us-gaap:Revenues" -> "Revenues"; a name without a prefix is used as-is.
        _prefix, separator, remainder = concept_full.partition(":")
        concept_short = remainder if separator else concept_full

        concept_data = gaap_facts.get(concept_short)
        if concept_data is None:
            continue

        # Take the entries of the first preferred unit the company reported.
        available_units: dict = concept_data.get("units", {})
        facts = next(
            (
                available_units[unit]
                for unit in _preferred_units(concept_short)
                if unit in available_units
            ),
            None,
        )
        if not facts:
            continue

        for fact in facts:
            form = fact.get("form", "")
            if form not in forms:
                continue
            end, filed, val = fact.get("end"), fact.get("filed"), fact.get("val")
            # Rows lacking a period end, filing date or value are unusable.
            if end is None or filed is None or val is None:
                continue
            records.append(
                {
                    "cik": cik_padded,
                    "concept": concept_full,
                    "end": end,
                    "filed": filed,
                    "val": float(val),
                    "form": form,
                    "accn": fact.get("accn", ""),
                }
            )

    if not records:
        return pd.DataFrame()

    table = pd.DataFrame(records)
    for date_column in ("end", "filed"):
        table[date_column] = pd.to_datetime(table[date_column], errors="coerce")
    table = table.dropna(subset=["end", "filed"])

    # PIT deduplication: for each (concept, end), keep the most recently filed record.
    # NOTE(review): earlier filings of the same period are discarded here, so a
    # query dated before the latest re-report will not see that period at all —
    # confirm this matches the intended point-in-time semantics.
    by_filing_date = table.sort_values("filed")
    latest_only = by_filing_date.drop_duplicates(subset=["concept", "end"], keep="last")
    return latest_only.sort_values("filed").reset_index(drop=True)
117
+
118
+
119
def parse_all(config: PitEdgarConfig, cik_map: pd.DataFrame) -> pd.DataFrame:
    """Build the master PIT parquet from every company listed in cik_map.

    Args:
        config: pipeline configuration.
        cik_map: DataFrame indexed by ticker with a 'cik' column.

    Returns:
        Master DataFrame with an additional leading 'ticker' column. When no
        company yields any rows, an empty DataFrame is returned and no file
        is written.
    """
    config.ensure_dirs()
    frames: list[pd.DataFrame] = []

    progress = tqdm(cik_map.iterrows(), total=len(cik_map), desc="Parsing")
    for ticker, record in progress:
        company_frame = parse_company(
            cik_padded=str(record["cik"]),
            concepts=config.concepts,
            facts_dir=config.facts_dir,
            forms=config.forms,
        )
        if not company_frame.empty:
            # Tag each company's rows with its ticker (the cik_map index value).
            company_frame.insert(0, "ticker", ticker)
            frames.append(company_frame)

    if not frames:
        logger.warning("No data parsed — check facts_dir and CIK map.")
        return pd.DataFrame()

    master = pd.concat(frames, ignore_index=True)

    out_path = config.data_dir / "pit_financials.parquet"
    master.to_parquet(out_path, index=False)
    logger.info(f"Master parquet saved: {out_path} ({len(master):,} rows)")
    return master
@@ -0,0 +1,137 @@
1
+ """PIT query API over the master parquet."""
2
+
3
+ from pathlib import Path
4
+
5
+ import pandas as pd
6
+
7
+
8
+ class PitQuery:
9
+ """Query point-in-time financial data from a master parquet file."""
10
+
11
+ def __init__(self, parquet_path: Path) -> None:
12
+ self.data = pd.read_parquet(parquet_path)
13
+ self.data["filed"] = pd.to_datetime(self.data["filed"], errors="coerce")
14
+ self.data["end"] = pd.to_datetime(self.data["end"], errors="coerce")
15
+
16
+ # ------------------------------------------------------------------
17
+ # Public API
18
+ # ------------------------------------------------------------------
19
+
20
+ def as_of(
21
+ self,
22
+ tickers: list[str] | str,
23
+ concept: str,
24
+ as_of_date: str | pd.Timestamp,
25
+ max_staleness_days: int = 180,
26
+ ) -> pd.DataFrame:
27
+ """Last known value of a concept for each ticker as of a given date.
28
+
29
+ Only filings with filed <= as_of_date are considered (no look-ahead).
30
+ Tickers whose most recent filing is older than max_staleness_days
31
+ return NaN for val.
32
+
33
+ Returns DataFrame with columns: ticker, val, filed, end, form
34
+ """
35
+ if isinstance(tickers, str):
36
+ tickers = [tickers]
37
+ as_of_ts = pd.Timestamp(as_of_date)
38
+ cutoff = as_of_ts - pd.Timedelta(days=max_staleness_days)
39
+
40
+ mask = (
41
+ self.data["ticker"].isin(tickers)
42
+ & (self.data["concept"] == concept)
43
+ & (self.data["filed"] <= as_of_ts)
44
+ )
45
+ sub = self.data.loc[mask].copy()
46
+
47
+ if sub.empty:
48
+ return self._empty_result(tickers)
49
+
50
+ # Most recently filed record per ticker
51
+ idx = sub.groupby("ticker")["filed"].idxmax()
52
+ result = sub.loc[idx, ["ticker", "val", "filed", "end", "form"]].copy()
53
+
54
+ # Nullify stale values
55
+ stale = result["filed"] < cutoff
56
+ result.loc[stale, "val"] = float("nan")
57
+
58
+ # Add tickers with no data at all
59
+ missing = set(tickers) - set(result["ticker"])
60
+ if missing:
61
+ filler = pd.DataFrame(
62
+ {"ticker": list(missing), "val": float("nan"), "filed": pd.NaT, "end": pd.NaT, "form": None}
63
+ )
64
+ result = pd.concat([result, filler], ignore_index=True)
65
+
66
+ return result.reset_index(drop=True)
67
+
68
+ def history(
69
+ self,
70
+ ticker: str,
71
+ concept: str,
72
+ start_date: str | None = None,
73
+ end_date: str | None = None,
74
+ freq: str = "Q",
75
+ ) -> pd.DataFrame:
76
+ """Point-in-time history of a concept for a single ticker.
77
+
78
+ Args:
79
+ ticker: company ticker symbol.
80
+ concept: XBRL concept, e.g. "us-gaap:Revenues".
81
+ start_date: filter end >= start_date (ISO string).
82
+ end_date: filter end <= end_date (ISO string).
83
+ freq: "Q" → 10-Q only, "A" → 10-K only, anything else → all.
84
+
85
+ Returns DataFrame with columns: ticker, concept, end, filed, val, form, accn
86
+ """
87
+ mask = (self.data["ticker"] == ticker) & (self.data["concept"] == concept)
88
+ sub = self.data.loc[mask].copy()
89
+
90
+ if freq == "Q":
91
+ sub = sub[sub["form"] == "10-Q"]
92
+ elif freq == "A":
93
+ sub = sub[sub["form"] == "10-K"]
94
+
95
+ if start_date is not None:
96
+ sub = sub[sub["end"] >= pd.Timestamp(start_date)]
97
+ if end_date is not None:
98
+ sub = sub[sub["end"] <= pd.Timestamp(end_date)]
99
+
100
+ return sub.sort_values("filed").reset_index(drop=True)
101
+
102
+ def cross_section(
103
+ self,
104
+ concept: str,
105
+ as_of_date: str,
106
+ tickers: list[str] | None = None,
107
+ max_staleness_days: int = 180,
108
+ ) -> pd.DataFrame:
109
+ """Cross-section of all (or a subset of) tickers for a concept at a date.
110
+
111
+ Useful for building portfolio signals.
112
+
113
+ Returns DataFrame with columns: ticker, val, filed, end, form
114
+ """
115
+ universe = tickers if tickers is not None else self.data["ticker"].unique().tolist()
116
+ return self.as_of(
117
+ tickers=universe,
118
+ concept=concept,
119
+ as_of_date=as_of_date,
120
+ max_staleness_days=max_staleness_days,
121
+ )
122
+
123
+ # ------------------------------------------------------------------
124
+ # Helpers
125
+ # ------------------------------------------------------------------
126
+
127
+ @staticmethod
128
+ def _empty_result(tickers: list[str]) -> pd.DataFrame:
129
+ return pd.DataFrame(
130
+ {
131
+ "ticker": tickers,
132
+ "val": float("nan"),
133
+ "filed": pd.NaT,
134
+ "end": pd.NaT,
135
+ "form": None,
136
+ }
137
+ )
@@ -0,0 +1,46 @@
1
+ [tool.poetry]
2
+ name = "pitedgar"
3
+ version = "0.1.0"
4
+ description = "Point-in-time SEC EDGAR financial data pipeline"
5
+ authors = ["Ariel Nacamulli"]
6
+ license = "MIT"
7
+ readme = "README.md"
8
+ homepage = "https://github.com/arielNacamulli/pitedgar"
9
+ repository = "https://github.com/arielNacamulli/pitedgar"
10
+ documentation = "https://github.com/arielNacamulli/pitedgar#readme"
11
+ keywords = ["SEC", "EDGAR", "financial-data", "point-in-time", "XBRL", "backtesting"]
12
+ classifiers = [
13
+ "Development Status :: 3 - Alpha",
14
+ "Intended Audience :: Financial and Insurance Industry",
15
+ "Intended Audience :: Developers",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Topic :: Office/Business :: Financial",
21
+ "Topic :: Scientific/Engineering",
22
+ "Typing :: Typed",
23
+ ]
24
+ packages = [{ include = "pitedgar" }]
25
+
26
+ [tool.poetry.dependencies]
27
+ python = ">=3.11"
28
+ edgartools = ">=5.0"
29
+ pandas = ">=2.0"
30
+ pyarrow = "*"
31
+ requests = "*"
32
+ tqdm = "*"
33
+ loguru = "*"
34
+ pydantic = ">=2.0"
35
+ click = "*"
36
+
37
+ [tool.poetry.group.dev.dependencies]
38
+ pytest = "*"
39
+ pytest-mock = "*"
40
+
41
+ [tool.poetry.scripts]
42
+ pitedgar = "pitedgar.cli:cli"
43
+
44
+ [build-system]
45
+ requires = ["poetry-core"]
46
+ build-backend = "poetry.core.masonry.api"