PyPI - pitedgar - Versions diffs - 0.1.0__tar.gz - Mend

pitedgar 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

pitedgar-0.1.0/LICENSE +21 -0
pitedgar-0.1.0/PKG-INFO +149 -0
pitedgar-0.1.0/README.md +114 -0
pitedgar-0.1.0/pitedgar/__init__.py +17 -0
pitedgar-0.1.0/pitedgar/cli.py +71 -0
pitedgar-0.1.0/pitedgar/config.py +48 -0
pitedgar-0.1.0/pitedgar/downloader.py +59 -0
pitedgar-0.1.0/pitedgar/mapping.py +45 -0
pitedgar-0.1.0/pitedgar/parser.py +154 -0
pitedgar-0.1.0/pitedgar/query.py +137 -0
pitedgar-0.1.0/pyproject.toml +46 -0

pitedgar-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2024 pitedgar contributors
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

pitedgar-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,149 @@
+Metadata-Version: 2.4
+Name: pitedgar
+Version: 0.1.0
+Summary: Point-in-time SEC EDGAR financial data pipeline
+License: MIT
+License-File: LICENSE
+Keywords: SEC,EDGAR,financial-data,point-in-time,XBRL,backtesting
+Author: Ariel Nacamulli
+Requires-Python: >=3.11
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Financial and Insurance Industry
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Classifier: Topic :: Office/Business :: Financial
+Classifier: Topic :: Scientific/Engineering
+Classifier: Typing :: Typed
+Requires-Dist: click
+Requires-Dist: edgartools (>=5.0)
+Requires-Dist: loguru
+Requires-Dist: pandas (>=2.0)
+Requires-Dist: pyarrow
+Requires-Dist: pydantic (>=2.0)
+Requires-Dist: requests
+Requires-Dist: tqdm
+Project-URL: Documentation, https://github.com/arielNacamulli/pitedgar#readme
+Project-URL: Homepage, https://github.com/arielNacamulli/pitedgar
+Project-URL: Repository, https://github.com/arielNacamulli/pitedgar
+Description-Content-Type: text/markdown
+# pitedgar
+[![CI](https://github.com/arielNacamulli/pitedgar/actions/workflows/ci.yml/badge.svg)](https://github.com/arielNacamulli/pitedgar/actions/workflows/ci.yml)
+[![PyPI version](https://img.shields.io/pypi/v/pitedgar.svg)](https://pypi.org/project/pitedgar/)
+[![Python versions](https://img.shields.io/pypi/pyversions/pitedgar.svg)](https://pypi.org/project/pitedgar/)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
+Point-in-time SEC EDGAR financial data pipeline.
+Downloads SEC EDGAR `companyfacts.zip`, parses XBRL JSON facts into a local
+parquet file, and exposes a query API with **zero look-ahead bias** — every
+value is stamped with the `filed` date (when the data was actually available
+to the market), not the period-end date.
+---
+## Installation
+```bash
+pip install pitedgar
+# or add it to a Poetry-managed project
+poetry add pitedgar
+```
+---
+## Quick start
+```python
+from pathlib import Path
+from pitedgar import PitEdgarConfig, build_cik_map, download_bulk, parse_all, PitQuery
+config = PitEdgarConfig(
+    edgar_identity="Mario Rossi mario@example.com",  # required by SEC
+    data_dir=Path("./data"),
+)
+# Step 1 — one-shot ticker → CIK mapping
+tickers = ["AAPL", "MSFT", "JPM", "GOOGL"]
+cik_map = build_cik_map(tickers, config)
+# Step 2 — download ~1.5 GB bulk ZIP (do this periodically, not every run)
+download_bulk(config)
+# Step 3 — parse JSON → parquet (sub-minute for 500 companies)
+master = parse_all(config, cik_map)
+# Step 4 — query
+q = PitQuery(config.data_dir / "pit_financials.parquet")
+# What revenue figure was available to the market on 2022-06-30?
+result = q.as_of(["AAPL", "MSFT"], "us-gaap:Revenues", "2022-06-30")
+# Full history
+hist = q.history("AAPL", "us-gaap:NetIncomeLoss", freq="A")
+# Portfolio cross-section signal
+xs = q.cross_section("us-gaap:NetIncomeLoss", "2023-12-31")
+```
+---
+## CLI
+```bash
+# Resolve tickers (tickers.txt has one ticker per line)
+pitedgar map --tickers tickers.txt --identity "Name name@email.com"
+# Download bulk ZIP
+pitedgar fetch --identity "Name name@email.com"
+# Parse to parquet
+pitedgar build --identity "Name name@email.com"
+# Query a single value
+pitedgar query --ticker AAPL --concept us-gaap:Revenues --as-of 2023-06-30
+```
+---
+## Key design decisions
+| Decision | Rationale |
+|---|---|
+| `filed` as PIT timestamp | The date the filing was submitted to SEC — this is when information became public |
+| Deduplication keeps latest `filed` per `(concept, end)` | Companies sometimes refile restated figures; keep the superseding value |
+| Raw USD values, no scale conversion | SEC reports values as-filed; downstream code applies any needed normalization |
+| Local parquet, no runtime HTTP | Queries run at DataFrame speed with no network dependency |
+---
+## Supported XBRL concepts (defaults)
+See `pitedgar.config.DEFAULT_CONCEPTS` for the full list, which includes
+revenues, net income, assets, liabilities, equity, EPS, cash, debt, operating
+cash flow, capex, and R&D expense.
+---
+## Examples
+- [`examples/fcf_sp500.py`](examples/fcf_sp500.py) — S&P 500 free cash flow benchmark: fetches constituents, builds the parquet, and queries FCF cross-sections across 20 quarters. Useful as an end-to-end performance reference.
+---
+## Contributing
+Contributions are welcome. See [CONTRIBUTING.md](CONTRIBUTING.md) for setup instructions, coding conventions, and the PR process.
+---
+## License
+[MIT](LICENSE)

pitedgar-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,114 @@
+# pitedgar
+[![CI](https://github.com/arielNacamulli/pitedgar/actions/workflows/ci.yml/badge.svg)](https://github.com/arielNacamulli/pitedgar/actions/workflows/ci.yml)
+[![PyPI version](https://img.shields.io/pypi/v/pitedgar.svg)](https://pypi.org/project/pitedgar/)
+[![Python versions](https://img.shields.io/pypi/pyversions/pitedgar.svg)](https://pypi.org/project/pitedgar/)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
+Point-in-time SEC EDGAR financial data pipeline.
+Downloads SEC EDGAR `companyfacts.zip`, parses XBRL JSON facts into a local
+parquet file, and exposes a query API with **zero look-ahead bias** — every
+value is stamped with the `filed` date (when the data was actually available
+to the market), not the period-end date.
+---
+## Installation
+```bash
+pip install pitedgar
+# or add it to a Poetry-managed project
+poetry add pitedgar
+```
+---
+## Quick start
+```python
+from pathlib import Path
+from pitedgar import PitEdgarConfig, build_cik_map, download_bulk, parse_all, PitQuery
+config = PitEdgarConfig(
+    edgar_identity="Mario Rossi mario@example.com",  # required by SEC
+    data_dir=Path("./data"),
+)
+# Step 1 — one-shot ticker → CIK mapping
+tickers = ["AAPL", "MSFT", "JPM", "GOOGL"]
+cik_map = build_cik_map(tickers, config)
+# Step 2 — download ~1.5 GB bulk ZIP (do this periodically, not every run)
+download_bulk(config)
+# Step 3 — parse JSON → parquet (sub-minute for 500 companies)
+master = parse_all(config, cik_map)
+# Step 4 — query
+q = PitQuery(config.data_dir / "pit_financials.parquet")
+# What revenue figure was available to the market on 2022-06-30?
+result = q.as_of(["AAPL", "MSFT"], "us-gaap:Revenues", "2022-06-30")
+# Full history
+hist = q.history("AAPL", "us-gaap:NetIncomeLoss", freq="A")
+# Portfolio cross-section signal
+xs = q.cross_section("us-gaap:NetIncomeLoss", "2023-12-31")
+```
+---
+## CLI
+```bash
+# Resolve tickers (tickers.txt has one ticker per line)
+pitedgar map --tickers tickers.txt --identity "Name name@email.com"
+# Download bulk ZIP
+pitedgar fetch --identity "Name name@email.com"
+# Parse to parquet
+pitedgar build --identity "Name name@email.com"
+# Query a single value
+pitedgar query --ticker AAPL --concept us-gaap:Revenues --as-of 2023-06-30
+```
+---
+## Key design decisions
+| Decision | Rationale |
+|---|---|
+| `filed` as PIT timestamp | The date the filing was submitted to SEC — this is when information became public |
+| Deduplication keeps latest `filed` per `(concept, end)` | Companies sometimes refile restated figures; keep the superseding value |
+| Raw USD values, no scale conversion | SEC reports values as-filed; downstream code applies any needed normalization |
+| Local parquet, no runtime HTTP | Queries run at DataFrame speed with no network dependency |
+---
+## Supported XBRL concepts (defaults)
+See `pitedgar.config.DEFAULT_CONCEPTS` for the full list, which includes
+revenues, net income, assets, liabilities, equity, EPS, cash, debt, operating
+cash flow, capex, and R&D expense.
+---
+## Examples
+- [`examples/fcf_sp500.py`](examples/fcf_sp500.py) — S&P 500 free cash flow benchmark: fetches constituents, builds the parquet, and queries FCF cross-sections across 20 quarters. Useful as an end-to-end performance reference.
+---
+## Contributing
+Contributions are welcome. See [CONTRIBUTING.md](CONTRIBUTING.md) for setup instructions, coding conventions, and the PR process.
+---
+## License
+[MIT](LICENSE)

pitedgar-0.1.0/pitedgar/__init__.py ADDED Viewed

@@ -0,0 +1,17 @@
+"""pitedgar — Point-in-time SEC EDGAR financial data pipeline."""
+__version__ = "0.1.0"
+from pitedgar.config import PitEdgarConfig
+from pitedgar.mapping import build_cik_map
+from pitedgar.downloader import download_bulk
+from pitedgar.parser import parse_all
+from pitedgar.query import PitQuery
+__all__ = [
+    "PitEdgarConfig",
+    "build_cik_map",
+    "download_bulk",
+    "parse_all",
+    "PitQuery",
+]

pitedgar-0.1.0/pitedgar/cli.py ADDED Viewed

@@ -0,0 +1,71 @@
+"""CLI entry points for pitedgar."""
+from pathlib import Path
+import click
+import pandas as pd
+from pitedgar.config import PitEdgarConfig
+from pitedgar.downloader import download_bulk
+from pitedgar.mapping import build_cik_map
+from pitedgar.parser import parse_all
+from pitedgar.query import PitQuery
+# Root command group; the subcommands register themselves via @cli.command(...).
+# The docstring is what click prints as the top-level help text, so it is kept verbatim.
+@click.group()
+def cli() -> None:
+    """pitedgar — SEC EDGAR point-in-time financial data pipeline."""
+@cli.command("map")
+@click.option("--tickers", "tickers_file", required=True, type=click.Path(exists=True),
+              help="Path to a text file with one ticker per line.")
+@click.option("--identity", required=True,
+              help='SEC identity string, e.g. "Name name@email.com".')
+@click.option("--data-dir", default="./data", show_default=True,
+              help="Directory where outputs are saved.")
+def cmd_map(tickers_file: str, identity: str, data_dir: str) -> None:
+    """Resolve tickers to CIK numbers and save the mapping."""
+    # Decode explicitly instead of relying on the locale default; "utf-8-sig"
+    # also swallows a leading BOM so it cannot end up inside the first ticker.
+    raw_lines = Path(tickers_file).read_text(encoding="utf-8-sig").splitlines()
+    # Normalise, drop blank lines and de-duplicate while keeping file order, so
+    # a repeated ticker is neither looked up twice nor stored twice in the map.
+    tickers = list(dict.fromkeys(t.strip().upper() for t in raw_lines if t.strip()))
+    config = PitEdgarConfig(edgar_identity=identity, data_dir=Path(data_dir))
+    cik_map = build_cik_map(tickers, config)
+    click.echo(f"Mapped {len(cik_map)} tickers → {config.data_dir / 'ticker_cik_map.parquet'}")
+# `fetch` subcommand: thin wrapper around download_bulk().
+@cli.command("fetch")
+@click.option("--force", is_flag=True, default=False,
+              help="Re-extract even if facts_dir already populated.")
+@click.option("--identity", required=True,
+              help='SEC identity string, e.g. "Name name@email.com".')
+@click.option("--data-dir", default="./data", show_default=True)
+def cmd_fetch(force: bool, identity: str, data_dir: str) -> None:
+    """Download and extract the SEC companyfacts bulk ZIP."""
+    # facts_dir is not passed, so the config derives it from data_dir.
+    config = PitEdgarConfig(edgar_identity=identity, data_dir=Path(data_dir))
+    # --force is forwarded unchanged; download_bulk returns the extraction directory.
+    facts_dir = download_bulk(config, force=force)
+    click.echo(f"Facts extracted to {facts_dir}")
+@cli.command("build")
+@click.option("--identity", required=True,
+              help='SEC identity string, e.g. "Name name@email.com".')
+@click.option("--data-dir", default="./data", show_default=True)
+def cmd_build(identity: str, data_dir: str) -> None:
+    """Parse all local JSON facts into the master PIT parquet."""
+    config = PitEdgarConfig(edgar_identity=identity, data_dir=Path(data_dir))
+    map_path = config.data_dir / "ticker_cik_map.parquet"
+    # The mapping is produced by the `map` subcommand; report its absence as a
+    # normal CLI error instead of letting a raw traceback escape.
+    if not map_path.exists():
+        raise click.ClickException(f"{map_path} not found; run `pitedgar map` first.")
+    cik_map = pd.read_parquet(map_path)
+    master = parse_all(config, cik_map)
+    click.echo(f"Built master parquet: {len(master):,} rows")
+@cli.command("query")
+@click.option("--ticker", required=True)
+@click.option("--concept", required=True, help='e.g. "us-gaap:Revenues"')
+@click.option("--as-of", "as_of", required=True, help="ISO date, e.g. 2023-06-30")
+@click.option("--data-dir", default="./data", show_default=True)
+def cmd_query(ticker: str, concept: str, as_of: str, data_dir: str) -> None:
+    """Query the latest PIT value for a ticker/concept as of a date."""
+    parquet_path = Path(data_dir) / "pit_financials.parquet"
+    # The master parquet is written by the `build` subcommand; report its
+    # absence as a normal CLI error instead of a raw traceback.
+    if not parquet_path.exists():
+        raise click.ClickException(f"{parquet_path} not found; run `pitedgar build` first.")
+    q = PitQuery(parquet_path)
+    # Tickers are upper-cased here, mirroring how the `map` subcommand stores them.
+    result = q.as_of([ticker.upper()], concept, as_of)
+    click.echo(result.to_string(index=False))

pitedgar-0.1.0/pitedgar/config.py ADDED Viewed

@@ -0,0 +1,48 @@
+"""Configuration for pitedgar via Pydantic BaseModel."""
+from pathlib import Path
+from pydantic import BaseModel, model_validator
+# Concepts extracted when the caller does not supply a list. Entries that a
+# company does not report are simply skipped by the parser.
+DEFAULT_CONCEPTS = [
+    "us-gaap:Revenues",
+    "us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax",
+    "us-gaap:NetIncomeLoss",
+    "us-gaap:Assets",
+    "us-gaap:Liabilities",
+    "us-gaap:StockholdersEquity",
+    "us-gaap:OperatingIncomeLoss",
+    "us-gaap:GrossProfit",
+    "us-gaap:EarningsPerShareBasic",
+    "us-gaap:EarningsPerShareDiluted",
+    "us-gaap:CommonStockSharesOutstanding",
+    "us-gaap:CashAndCashEquivalentsAtCarryingValue",
+    "us-gaap:LongTermDebt",
+    # NOTE(review): "OperatingCashFlow" does not look like a us-gaap element
+    # name; the next entry is the standard operating-cash-flow tag. Kept for
+    # backward compatibility — confirm against the taxonomy before removing.
+    "us-gaap:OperatingCashFlow",
+    "us-gaap:NetCashProvidedByUsedInOperatingActivities",
+    # Cash paid for property, plant and equipment: the capex line of the
+    # cash-flow statement.
+    "us-gaap:PaymentsToAcquirePropertyPlantAndEquipment",
+    # Non-cash supplemental disclosure (capex accrued but unpaid); on its own
+    # it is not total capital expenditure.
+    "us-gaap:CapitalExpendituresIncurredButNotYetPaid",
+    "us-gaap:ResearchAndDevelopmentExpense",
+]
+# Filing forms kept by default.
+DEFAULT_FORMS = ["10-K", "10-Q"]
+# Default location of the companyfacts bulk archive.
+BULK_ZIP_URL = "https://www.sec.gov/Archives/edgar/daily-index/xbrl/companyfacts.zip"
+# Pipeline settings shared by the mapping, download, parse and CLI steps.
+class PitEdgarConfig(BaseModel):
+    # Identity string; the downloader sends it as the User-Agent header and the
+    # mapping step passes it to edgar.set_identity().
+    edgar_identity: str
+    # Root directory for the ZIP and the parquet outputs.
+    data_dir: Path
+    # Directory holding the extracted CIK*.json files; derived from data_dir
+    # by set_facts_dir() when left as None.
+    facts_dir: Path | None = None
+    # Source URL of the bulk archive.
+    zip_url: str = BULK_ZIP_URL
+    # Concepts and filing forms the parser keeps.
+    concepts: list[str] = DEFAULT_CONCEPTS
+    forms: list[str] = DEFAULT_FORMS
+    model_config = {"arbitrary_types_allowed": True}
+    @model_validator(mode="after")
+    def set_facts_dir(self) -> "PitEdgarConfig":
+        """Default ``facts_dir`` to ``data_dir / "companyfacts"`` when unset."""
+        if self.facts_dir is None:
+            self.facts_dir = self.data_dir / "companyfacts"
+        return self
+    def ensure_dirs(self) -> None:
+        """Create ``data_dir`` and ``facts_dir`` (with parents) if they are missing."""
+        self.data_dir.mkdir(parents=True, exist_ok=True)
+        self.facts_dir.mkdir(parents=True, exist_ok=True)

pitedgar-0.1.0/pitedgar/downloader.py ADDED Viewed

@@ -0,0 +1,59 @@
+"""Step 2: bulk download companyfacts.zip from SEC EDGAR."""
+import zipfile
+from pathlib import Path
+import requests
+from loguru import logger
+from tqdm import tqdm
+from pitedgar.config import PitEdgarConfig
+def download_bulk(config: PitEdgarConfig, force: bool = False) -> Path:
+    """Download and extract the SEC companyfacts bulk ZIP.
+    The archive is downloaded on every call; ``force`` only controls whether
+    an already populated facts directory is extracted again.
+    Args:
+        config: pipeline configuration.
+        force: re-extract even if facts_dir already exists.
+    Returns:
+        Path to the extracted facts directory.
+    Raises:
+        requests.HTTPError: if the server answers with an error status.
+    """
+    config.ensure_dirs()
+    zip_path = config.data_dir / "companyfacts.zip"
+    # Stream into a sibling ".part" file and rename on success, so an
+    # interrupted transfer never leaves a truncated companyfacts.zip behind.
+    part_path = zip_path.with_name(zip_path.name + ".part")
+    headers = {"User-Agent": config.edgar_identity}
+    logger.info(f"Downloading {config.zip_url} …")
+    try:
+        with requests.get(config.zip_url, stream=True, headers=headers, timeout=120) as resp:
+            resp.raise_for_status()
+            # A missing or zero Content-Length leaves the progress bar open-ended.
+            total = int(resp.headers.get("Content-Length", 0)) or None
+            with (
+                open(part_path, "wb") as fh,
+                tqdm(
+                    total=total,
+                    unit="B",
+                    unit_scale=True,
+                    unit_divisor=1024,
+                    desc="companyfacts.zip",
+                ) as bar,
+            ):
+                for chunk in resp.iter_content(chunk_size=1 << 20):
+                    if not chunk:
+                        # Nothing to write for an empty chunk.
+                        continue
+                    fh.write(chunk)
+                    bar.update(len(chunk))
+    except BaseException:
+        # Remove the partial file on any failure (including Ctrl-C), then re-raise.
+        part_path.unlink(missing_ok=True)
+        raise
+    part_path.replace(zip_path)
+    logger.info(f"ZIP saved: {zip_path}")
+    facts_dir = config.facts_dir
+    if force or not facts_dir.exists() or not any(facts_dir.iterdir()):
+        logger.info(f"Extracting to {facts_dir} …")
+        with zipfile.ZipFile(zip_path, "r") as zf:
+            members = zf.namelist()
+            # Member-by-member extraction keeps the progress bar meaningful.
+            for member in tqdm(members, desc="Extracting", unit="file"):
+                zf.extract(member, facts_dir)
+        logger.info("Extraction complete.")
+    else:
+        logger.info("Facts dir already populated, skipping extraction (use force=True to override).")
+    return facts_dir

pitedgar-0.1.0/pitedgar/mapping.py ADDED Viewed

@@ -0,0 +1,45 @@
+"""Step 1: resolve ticker → CIK via edgartools."""
+import time
+import edgar
+import pandas as pd
+from loguru import logger
+from pitedgar.config import PitEdgarConfig
+def build_cik_map(tickers: list[str], config: PitEdgarConfig) -> pd.DataFrame:
+    """Resolve a list of tickers to CIK numbers via edgartools.
+    Tickers that cannot be resolved are logged and skipped. When none resolve
+    (or ``tickers`` is empty) an empty, correctly shaped frame is saved and
+    returned.
+    Saves the result to data_dir/ticker_cik_map.parquet.
+    Returns a DataFrame indexed by ticker with columns:
+        cik, name, sic, fiscal_year_end, exchange
+    """
+    edgar.set_identity(config.edgar_identity)
+    config.ensure_dirs()
+    records: list[dict] = []
+    for ticker in tickers:
+        try:
+            company = edgar.Company(ticker)
+            # Zero-pad to 10 digits, the width used in the CIK##########.json file names.
+            cik_padded = f"{company.cik:010d}"
+            records.append(
+                {
+                    "ticker": ticker.upper(),
+                    "cik": cik_padded,
+                    "name": getattr(company, "name", None),
+                    "sic": getattr(company, "sic", None),
+                    "fiscal_year_end": getattr(company, "fiscal_year_end", None),
+                    "exchange": getattr(company, "exchange", None),
+                }
+            )
+            logger.debug(f"{ticker} → CIK {cik_padded}")
+        except Exception as exc:
+            # Best effort: one bad ticker must not abort the whole batch.
+            logger.warning(f"Could not resolve ticker '{ticker}': {exc}")
+        # Short pause between lookups.
+        time.sleep(0.1)
+    # Explicit columns keep set_index("ticker") from raising KeyError when
+    # nothing was resolved and `records` is empty.
+    columns = ["ticker", "cik", "name", "sic", "fiscal_year_end", "exchange"]
+    df = pd.DataFrame(records, columns=columns).set_index("ticker")
+    out_path = config.data_dir / "ticker_cik_map.parquet"
+    df.to_parquet(out_path)
+    logger.info(f"CIK map saved: {out_path} ({len(df)} rows)")
+    return df

pitedgar-0.1.0/pitedgar/parser.py ADDED Viewed

@@ -0,0 +1,154 @@
+"""Step 3: parse local JSON facts into a point-in-time parquet master."""
+import json
+from pathlib import Path
+import pandas as pd
+from loguru import logger
+from tqdm import tqdm
+from pitedgar.config import PitEdgarConfig
+# Units to attempt per concept, in priority order.
+# In the companyfacts JSON, per-share amounts (EPS) are keyed "USD/shares",
+# share counts "shares", and monetary amounts "USD".
+_PER_SHARE_CONCEPTS = {
+    "EarningsPerShareBasic",
+    "EarningsPerShareDiluted",
+}
+_SHARE_CONCEPTS = {
+    "CommonStockSharesOutstanding",
+}
+def _preferred_units(concept_short: str) -> list[str]:
+    """Return the unit keys to try for *concept_short*, most likely first.
+    Args:
+        concept_short: concept name without the "us-gaap:" prefix.
+    """
+    if concept_short in _PER_SHARE_CONCEPTS:
+        # "USD/shares" first; the old fallbacks stay so nothing that matched
+        # before stops matching.
+        return ["USD/shares", "shares", "USD"]
+    if concept_short in _SHARE_CONCEPTS:
+        return ["shares", "USD"]
+    return ["USD", "shares"]
+def parse_company(
+    cik_padded: str,
+    concepts: list[str],
+    facts_dir: Path,
+    forms: list[str],
+) -> pd.DataFrame:
+    """Flatten one company's companyfacts JSON into tidy point-in-time rows.
+    Args:
+        cik_padded: zero-padded 10-digit CIK string.
+        concepts:   list of "us-gaap:ConceptName" strings.
+        facts_dir:  directory containing CIK*.json files.
+        forms:      filing forms to keep, e.g. ["10-K", "10-Q"].
+    Returns:
+        DataFrame with columns: cik, concept, end, filed, val, form, accn.
+        An empty DataFrame when the file is missing, has no us-gaap facts, or
+        yields no usable rows.
+    """
+    source = facts_dir / f"CIK{cik_padded}.json"
+    if not source.exists():
+        logger.debug(f"No JSON for CIK {cik_padded}, skipping.")
+        return pd.DataFrame()
+    with open(source, "r", encoding="utf-8") as handle:
+        payload = json.load(handle)
+    gaap_facts = payload.get("facts", {}).get("us-gaap", {})
+    if not gaap_facts:
+        return pd.DataFrame()
+    records: list[dict] = []
+    for concept_full in concepts:
+        # Drop an optional "prefix:" to get the bare element name.
+        concept_short = concept_full.split(":", 1)[-1]
+        fact = gaap_facts.get(concept_short)
+        if fact is None:
+            continue
+        by_unit: dict = fact.get("units", {})
+        # The first preferred unit that is present wins; None when none match.
+        chosen = next(
+            (by_unit[unit] for unit in _preferred_units(concept_short) if unit in by_unit),
+            None,
+        )
+        if not chosen:
+            continue
+        for item in chosen:
+            form = item.get("form", "")
+            if form not in forms:
+                continue
+            end, filed, val = item.get("end"), item.get("filed"), item.get("val")
+            # An entry without a period end, filing date or value is unusable.
+            if end is None or filed is None or val is None:
+                continue
+            records.append(
+                {
+                    "cik": cik_padded,
+                    "concept": concept_full,
+                    "end": end,
+                    "filed": filed,
+                    "val": float(val),
+                    "form": form,
+                    "accn": item.get("accn", ""),
+                }
+            )
+    if not records:
+        return pd.DataFrame()
+    frame = pd.DataFrame(records)
+    for date_col in ("end", "filed"):
+        frame[date_col] = pd.to_datetime(frame[date_col], errors="coerce")
+    frame = frame.dropna(subset=["end", "filed"])
+    # PIT deduplication: one row per (concept, end), the one filed last.
+    by_filed = frame.sort_values("filed")
+    deduped = by_filed.drop_duplicates(subset=["concept", "end"], keep="last")
+    return deduped.sort_values("filed").reset_index(drop=True)
+def parse_all(config: PitEdgarConfig, cik_map: pd.DataFrame) -> pd.DataFrame:
+    """Build the master PIT parquet from every company listed in cik_map.
+    Args:
+        config:  pipeline configuration.
+        cik_map: DataFrame indexed by ticker with a 'cik' column.
+    Returns:
+        Master DataFrame with an additional 'ticker' column, or an empty
+        DataFrame (nothing is written) when no company produced any rows.
+    """
+    config.ensure_dirs()
+    frames: list[pd.DataFrame] = []
+    progress = tqdm(cik_map.iterrows(), total=len(cik_map), desc="Parsing")
+    for ticker, row in progress:
+        company_df = parse_company(
+            cik_padded=str(row["cik"]),
+            concepts=config.concepts,
+            facts_dir=config.facts_dir,
+            forms=config.forms,
+        )
+        if not company_df.empty:
+            # Put the ticker first so it leads the column order of the master.
+            company_df.insert(0, "ticker", ticker)
+            frames.append(company_df)
+    if not frames:
+        logger.warning("No data parsed — check facts_dir and CIK map.")
+        return pd.DataFrame()
+    master = pd.concat(frames, ignore_index=True)
+    destination = config.data_dir / "pit_financials.parquet"
+    master.to_parquet(destination, index=False)
+    logger.info(f"Master parquet saved: {destination} ({len(master):,} rows)")
+    return master

pitedgar-0.1.0/pitedgar/query.py ADDED Viewed

@@ -0,0 +1,137 @@
+"""PIT query API over the master parquet."""
+from pathlib import Path
+import pandas as pd
+class PitQuery:
+    """Query point-in-time financial data from a master parquet file.
+    The parquet is loaded into memory as ``self.data``; the methods read its
+    ticker, concept, end, filed, val and form columns.
+    """
+    def __init__(self, parquet_path: Path) -> None:
+        """Load the master parquet and coerce ``filed`` / ``end`` to datetimes."""
+        self.data = pd.read_parquet(parquet_path)
+        # Unparseable dates become NaT instead of raising.
+        self.data["filed"] = pd.to_datetime(self.data["filed"], errors="coerce")
+        self.data["end"] = pd.to_datetime(self.data["end"], errors="coerce")
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    def as_of(
+        self,
+        tickers: list[str] | str,
+        concept: str,
+        as_of_date: str | pd.Timestamp,
+        max_staleness_days: int = 180,
+    ) -> pd.DataFrame:
+        """Last known value of a concept for each ticker as of a given date.
+        Only filings with filed <= as_of_date are considered (no look-ahead).
+        Per ticker the most recently filed record is returned; when that filing
+        carries several period ends (e.g. prior-year comparatives), the one
+        with the latest ``end`` wins.
+        Tickers whose most recent filing is older than max_staleness_days
+        return NaN for val. Tickers without any matching record are appended
+        as all-NaN rows, in the order they were requested.
+        Returns DataFrame with columns: ticker, val, filed, end, form
+        """
+        if isinstance(tickers, str):
+            tickers = [tickers]
+        as_of_ts = pd.Timestamp(as_of_date)
+        cutoff = as_of_ts - pd.Timedelta(days=max_staleness_days)
+        mask = (
+            self.data["ticker"].isin(tickers)
+            & (self.data["concept"] == concept)
+            & (self.data["filed"] <= as_of_ts)
+        )
+        sub = self.data.loc[mask]
+        if sub.empty:
+            return self._empty_result(tickers)
+        # Order by (ticker, filed, end) and keep the last row per ticker: the
+        # newest filing, with ties on `filed` resolved towards the latest
+        # period end. NaT period ends sort first so they never win a tie.
+        result = (
+            sub.sort_values(["ticker", "filed", "end"], na_position="first")
+            .drop_duplicates(subset="ticker", keep="last")
+            .loc[:, ["ticker", "val", "filed", "end", "form"]]
+            .copy()
+        )
+        # Nullify stale values
+        stale = result["filed"] < cutoff
+        result.loc[stale, "val"] = float("nan")
+        # Add tickers with no data at all, de-duplicated and in request order
+        # so the output does not depend on set iteration order.
+        present = set(result["ticker"])
+        missing = [t for t in dict.fromkeys(tickers) if t not in present]
+        if missing:
+            filler = pd.DataFrame(
+                {"ticker": missing, "val": float("nan"), "filed": pd.NaT, "end": pd.NaT, "form": None}
+            )
+            result = pd.concat([result, filler], ignore_index=True)
+        return result.reset_index(drop=True)
+    def history(
+        self,
+        ticker: str,
+        concept: str,
+        start_date: str | None = None,
+        end_date: str | None = None,
+        freq: str = "Q",
+    ) -> pd.DataFrame:
+        """Point-in-time history of a concept for a single ticker.
+        Args:
+            ticker:     company ticker symbol.
+            concept:    XBRL concept, e.g. "us-gaap:Revenues".
+            start_date: filter end >= start_date (ISO string).
+            end_date:   filter end <= end_date (ISO string).
+            freq:       "Q" → 10-Q only, "A" → 10-K only, anything else → all.
+        Returns the matching rows of the master frame with all of its columns,
+        sorted by ``filed``.
+        """
+        mask = (self.data["ticker"] == ticker) & (self.data["concept"] == concept)
+        sub = self.data.loc[mask].copy()
+        if freq == "Q":
+            sub = sub[sub["form"] == "10-Q"]
+        elif freq == "A":
+            sub = sub[sub["form"] == "10-K"]
+        if start_date is not None:
+            sub = sub[sub["end"] >= pd.Timestamp(start_date)]
+        if end_date is not None:
+            sub = sub[sub["end"] <= pd.Timestamp(end_date)]
+        return sub.sort_values("filed").reset_index(drop=True)
+    def cross_section(
+        self,
+        concept: str,
+        as_of_date: str,
+        tickers: list[str] | None = None,
+        max_staleness_days: int = 180,
+    ) -> pd.DataFrame:
+        """Cross-section of all (or a subset of) tickers for a concept at a date.
+        Useful for building portfolio signals. Delegates to ``as_of``; with
+        ``tickers=None`` the universe is every ticker present in the data.
+        Returns DataFrame with columns: ticker, val, filed, end, form
+        """
+        universe = tickers if tickers is not None else self.data["ticker"].unique().tolist()
+        return self.as_of(
+            tickers=universe,
+            concept=concept,
+            as_of_date=as_of_date,
+            max_staleness_days=max_staleness_days,
+        )
+    # ------------------------------------------------------------------
+    # Helpers
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _empty_result(tickers: list[str]) -> pd.DataFrame:
+        """Return one all-NaN row per requested ticker."""
+        return pd.DataFrame(
+            {
+                "ticker": tickers,
+                "val": float("nan"),
+                "filed": pd.NaT,
+                "end": pd.NaT,
+                "form": None,
+            }
+        )

pitedgar-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,46 @@
+[tool.poetry]
+name = "pitedgar"
+version = "0.1.0"
+description = "Point-in-time SEC EDGAR financial data pipeline"
+authors = ["Ariel Nacamulli"]
+license = "MIT"
+readme = "README.md"
+homepage = "https://github.com/arielNacamulli/pitedgar"
+repository = "https://github.com/arielNacamulli/pitedgar"
+documentation = "https://github.com/arielNacamulli/pitedgar#readme"
+keywords = ["SEC", "EDGAR", "financial-data", "point-in-time", "XBRL", "backtesting"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Financial and Insurance Industry",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Office/Business :: Financial",
+    "Topic :: Scientific/Engineering",
+    "Typing :: Typed",
+]
+packages = [{ include = "pitedgar" }]
+[tool.poetry.dependencies]
+python = ">=3.11"
+edgartools = ">=5.0"
+pandas = ">=2.0"
+pyarrow = "*"
+requests = "*"
+tqdm = "*"
+loguru = "*"
+pydantic = ">=2.0"
+click = "*"
+[tool.poetry.group.dev.dependencies]
+pytest = "*"
+pytest-mock = "*"
+[tool.poetry.scripts]
+pitedgar = "pitedgar.cli:cli"
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"