xfinlink 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xfinlink-0.1.1/CLAUDE.md +121 -0
- xfinlink-0.1.1/MANIFEST.in +7 -0
- xfinlink-0.1.1/PKG-INFO +171 -0
- xfinlink-0.1.1/README.md +131 -0
- xfinlink-0.1.1/pyproject.toml +69 -0
- xfinlink-0.1.1/setup.cfg +4 -0
- xfinlink-0.1.1/xfinlink/__init__.py +129 -0
- xfinlink-0.1.1/xfinlink/_db_check.py +23 -0
- xfinlink-0.1.1/xfinlink/alembic.ini +149 -0
- xfinlink-0.1.1/xfinlink/api/__init__.py +0 -0
- xfinlink-0.1.1/xfinlink/api/main.py +287 -0
- xfinlink-0.1.1/xfinlink/cli.py +443 -0
- xfinlink-0.1.1/xfinlink/db.py +35 -0
- xfinlink-0.1.1/xfinlink/ingestion/__init__.py +0 -0
- xfinlink-0.1.1/xfinlink/ingestion/corporate_events.py +403 -0
- xfinlink-0.1.1/xfinlink/ingestion/edgar_cik.py +529 -0
- xfinlink-0.1.1/xfinlink/ingestion/utils.py +70 -0
- xfinlink-0.1.1/xfinlink/ingestion/wrds_ccm.py +232 -0
- xfinlink-0.1.1/xfinlink/ingestion/wrds_compustat.py +489 -0
- xfinlink-0.1.1/xfinlink/ingestion/wrds_crsp.py +340 -0
- xfinlink-0.1.1/xfinlink/ingestion/wrds_ibes.py +209 -0
- xfinlink-0.1.1/xfinlink/license.py +36 -0
- xfinlink-0.1.1/xfinlink/mcp_server/__init__.py +0 -0
- xfinlink-0.1.1/xfinlink/mcp_server/server.py +328 -0
- xfinlink-0.1.1/xfinlink/migrations/README +1 -0
- xfinlink-0.1.1/xfinlink/migrations/env.py +75 -0
- xfinlink-0.1.1/xfinlink/migrations/script.py.mako +28 -0
- xfinlink-0.1.1/xfinlink/migrations/versions/39d93271e7d8_initial_schema.py +122 -0
- xfinlink-0.1.1/xfinlink/models.py +157 -0
- xfinlink-0.1.1/xfinlink/panel.py +264 -0
- xfinlink-0.1.1/xfinlink/resolver/__init__.py +0 -0
- xfinlink-0.1.1/xfinlink/resolver/core.py +405 -0
- xfinlink-0.1.1/xfinlink.egg-info/PKG-INFO +171 -0
- xfinlink-0.1.1/xfinlink.egg-info/SOURCES.txt +36 -0
- xfinlink-0.1.1/xfinlink.egg-info/dependency_links.txt +1 -0
- xfinlink-0.1.1/xfinlink.egg-info/entry_points.txt +2 -0
- xfinlink-0.1.1/xfinlink.egg-info/requires.txt +21 -0
- xfinlink-0.1.1/xfinlink.egg-info/top_level.txt +1 -0
xfinlink-0.1.1/CLAUDE.md
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# xfinlink — guidance for AI agents
|
|
2
|
+
|
|
3
|
+
xfinlink is a local entity-resolution tool for financial data. It answers one
|
|
4
|
+
question: **"which economic entity does this identifier refer to, and what
|
|
5
|
+
other identifiers does that same entity have?"** It ships five MCP tools
|
|
6
|
+
(`resolve_identifier`, `bulk_resolve`, `entity_lineage`, `search_entity`,
|
|
7
|
+
`validate_identifiers`), a Python API, a CLI, and a local HTTP server — all
|
|
8
|
+
backed by the same Postgres crosswalk database built from CRSP, Compustat,
|
|
9
|
+
IBES, and SEC EDGAR.
|
|
10
|
+
|
|
11
|
+
If you are reasoning about a dataset that mixes any of the following
|
|
12
|
+
identifiers, you should be using xfinlink:
|
|
13
|
+
|
|
14
|
+
- **PERMNO / PERMCO** — CRSP (market data: prices, returns, volume)
|
|
15
|
+
- **GVKEY** — Compustat (fundamentals: balance sheet, income statement)
|
|
16
|
+
- **CIK** — SEC EDGAR (filings: 10-K, 10-Q, 8-K, 13F)
|
|
17
|
+
- **CUSIP (8 or 9)** — bond/equity master files
|
|
18
|
+
- **Ticker** — market data vendors (Yahoo, Bloomberg, yfinance, FRED)
|
|
19
|
+
- **ISIN / IBES ticker / FIGI** — international, analyst estimates, OpenFIGI
|
|
20
|
+
|
|
21
|
+
## Why this matters: the ticker-recycling problem
|
|
22
|
+
|
|
23
|
+
Tickers are not stable identifiers. They get retired, reissued, and reused
|
|
24
|
+
across companies that have nothing to do with each other. A naive
|
|
25
|
+
ticker-based merge across databases silently stitches unrelated entities into
|
|
26
|
+
one row-group and corrupts downstream analysis. **Nearly 10,000 tickers in
|
|
27
|
+
the CRSP universe map to more than one distinct entity over time.** A few
|
|
28
|
+
high-salience examples:
|
|
29
|
+
|
|
30
|
+
- **GM** — Before June 2009, "GM" was General Motors Corporation (PERMNO
|
|
31
|
+
12079). The company filed Chapter 11 on 2009-06-01. "Old GM" was delisted
|
|
32
|
+
and eventually wound down. On 2010-11-18, an entirely new company, General
|
|
33
|
+
Motors Company (PERMNO 12369), IPO'd out of the bankruptcy and took the
|
|
34
|
+
"GM" ticker. They share the same GVKEY (005073) because Compustat models
|
|
35
|
+
the economic continuation, but they have different PERMNOs because CRSP
|
|
36
|
+
treats them as legally distinct securities. A panel keyed on ticker "GM"
|
|
37
|
+
across 2005-2024 is a silent mixture of two different companies.
|
|
38
|
+
- **FB** — Belonged to Meta Platforms Inc until its 2022 ticker change to META.
|
|
39
|
+
Historically the "FB" ticker has also been used by PROSHARES, FBR Capital,
|
|
40
|
+
BankBoston, and Falcon Building Products. Resolving "FB" without a date
|
|
41
|
+
returns five matches.
|
|
42
|
+
- **META** — The ticker was used by another company before Meta adopted it
|
|
43
|
+
in 2022.
|
|
44
|
+
- **INGR** — Recycled multiple times across unrelated corporations over the
|
|
45
|
+
decades.
|
|
46
|
+
|
|
47
|
+
When you see `ambiguous=true` from `resolve_identifier`, treat it as a
|
|
48
|
+
disambiguation prompt — either pass `as_of_date`, or call `entity_lineage`
|
|
49
|
+
to understand the family tree.
|
|
50
|
+
|
|
51
|
+
## Which tool to use when
|
|
52
|
+
|
|
53
|
+
- **You have an id_value and need the entity (and all its other ids):**
|
|
54
|
+
`resolve_identifier`. Pass `as_of_date` if the data is time-series.
|
|
55
|
+
- **You have many ids to resolve in one shot:** `bulk_resolve` (max 100).
|
|
56
|
+
- **You have a company name but no id:** `search_entity` → pick a result →
|
|
57
|
+
`resolve_identifier` on one of its PERMNO/GVKEY values for the full record.
|
|
58
|
+
- **You have a merged panel and want to audit it:** `validate_identifiers`.
|
|
59
|
+
Run it AFTER merging. Any `wrong_entity` flag means your panel has stitched
|
|
60
|
+
different companies under one ticker.
|
|
61
|
+
- **You need to understand a company's corporate history:** `entity_lineage`.
|
|
62
|
+
Walks the predecessor/successor graph across bankruptcies, mergers,
|
|
63
|
+
spinoffs, name changes, and ticker changes.
|
|
64
|
+
|
|
65
|
+
## Canonical merge keys
|
|
66
|
+
|
|
67
|
+
Once you have a resolution, **prefer PERMNO or GVKEY as the merge key** in
|
|
68
|
+
downstream joins. Both are permanent identifiers issued by data vendors that
|
|
69
|
+
survive ticker changes, reorganizations, and most corporate events. Tickers
|
|
70
|
+
are display values, not keys. CIKs are keys for SEC filings but are not
|
|
71
|
+
always unique across subsidiaries and parents.
|
|
72
|
+
|
|
73
|
+
In order of preference for panel construction:
|
|
74
|
+
|
|
75
|
+
1. **GVKEY** — most stable for economic-continuation analysis (survives
|
|
76
|
+
bankruptcies where the business continues).
|
|
77
|
+
2. **PERMNO** — most precise for market-data analysis (distinguishes the
|
|
78
|
+
pre- and post-bankruptcy security, which GVKEY does not).
|
|
79
|
+
3. **CIK** — use for SEC filings. Not recommended as the primary merge key
|
|
80
|
+
because a single parent filer can cover multiple economic entities.
|
|
81
|
+
4. **CUSIP (9)** — useful for bond/equity-level joins but changes on some
|
|
82
|
+
corporate actions.
|
|
83
|
+
5. **Ticker** — display only. Never the merge key.
|
|
84
|
+
|
|
85
|
+
## Historical resolution
|
|
86
|
+
|
|
87
|
+
Almost every financial dataset is a panel. `as_of_date` is the most
|
|
88
|
+
important parameter in xfinlink. Use YYYY-MM-DD format. The resolver applies
|
|
89
|
+
the temporal filter `valid_from <= as_of_date <= valid_to` against each
|
|
90
|
+
identifier's validity window, with NULL treated as "open-ended."
|
|
91
|
+
|
|
92
|
+
If a user asks "who is AAPL?" without a date, that's probably fine (AAPL has
|
|
93
|
+
been Apple Inc since 1980). If they ask "what was FB's GVKEY in 2019?", use
|
|
94
|
+
`as_of_date="2019-01-01"` to avoid the 5-way ambiguity.
|
|
95
|
+
|
|
96
|
+
## Validation workflow
|
|
97
|
+
|
|
98
|
+
The canonical ticker-recycling bug hides in silence: the merge succeeds, the
|
|
99
|
+
DataFrame has the right number of rows, the regression runs, and the results
|
|
100
|
+
are quietly wrong. The defense is to run `validate_identifiers` on every
|
|
101
|
+
panel before trusting it.
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
validate_identifiers(identifiers='[{"id_type":"ticker","id_value":"GM","date":"2006-01-01"}, {"id_type":"ticker","id_value":"GM","date":"2022-01-01"}]')
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
If `is_clean` is false and the issues include `wrong_entity`, re-key on
|
|
108
|
+
PERMNO or GVKEY and re-merge. The `coverage` dict shows you which canonical
|
|
109
|
+
entities the panel actually matched and their row counts, so a stitched GM
|
|
110
|
+
panel will read `{"General Motors Corporation (pre-2009 bankruptcy)": 4,
|
|
111
|
+
"General Motors Company": 6}` — a two-entity breakdown that's an unmistakable
|
|
112
|
+
signal of the problem.
|
|
113
|
+
|
|
114
|
+
## Non-goals
|
|
115
|
+
|
|
116
|
+
- xfinlink does not serve market data, prices, fundamentals, or filings. It
|
|
117
|
+
is a crosswalk, not a data provider.
|
|
118
|
+
- xfinlink does not guess. If `not_found` is true, the identifier genuinely
|
|
119
|
+
is not in the crosswalk — do not fabricate a match.
|
|
120
|
+
- xfinlink does not resolve private companies, funds, or non-US entities
|
|
121
|
+
outside what CRSP / Compustat / EDGAR cover.
|
xfinlink-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: xfinlink
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Financial entity identifier resolution — resolve, validate, and fix PERMNO/GVKEY/CIK/CUSIP/ticker/ISIN mappings across CRSP, Compustat, EDGAR, and more.
|
|
5
|
+
Author: xfinlink
|
|
6
|
+
License-Expression: LicenseRef-Proprietary
|
|
7
|
+
Project-URL: Homepage, https://xfinlink.io
|
|
8
|
+
Project-URL: Documentation, https://xfinlink.io/docs
|
|
9
|
+
Keywords: finance,CRSP,Compustat,EDGAR,PERMNO,GVKEY,CUSIP,ticker,identifier,resolver
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Financial and Insurance Industry
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Office/Business :: Financial
|
|
19
|
+
Requires-Python: >=3.11
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
Requires-Dist: pandas
|
|
22
|
+
Requires-Dist: pyarrow
|
|
23
|
+
Requires-Dist: sqlalchemy>=2.0
|
|
24
|
+
Requires-Dist: psycopg2-binary
|
|
25
|
+
Requires-Dist: asyncpg
|
|
26
|
+
Requires-Dist: greenlet
|
|
27
|
+
Requires-Dist: fastapi
|
|
28
|
+
Requires-Dist: uvicorn
|
|
29
|
+
Requires-Dist: httpx
|
|
30
|
+
Requires-Dist: alembic
|
|
31
|
+
Requires-Dist: pydantic
|
|
32
|
+
Requires-Dist: python-dotenv
|
|
33
|
+
Requires-Dist: typer[all]
|
|
34
|
+
Requires-Dist: mcp
|
|
35
|
+
Provides-Extra: wrds
|
|
36
|
+
Requires-Dist: wrds; extra == "wrds"
|
|
37
|
+
Provides-Extra: dev
|
|
38
|
+
Requires-Dist: pytest; extra == "dev"
|
|
39
|
+
Requires-Dist: pytest-asyncio; extra == "dev"
|
|
40
|
+
|
|
41
|
+
# xfinlink
|
|
42
|
+
|
|
43
|
+
Financial entity identifier resolution. Resolve, validate, and fix identifier mappings across CRSP, Compustat, SEC EDGAR, IBES, and more.
|
|
44
|
+
|
|
45
|
+
## The problem
|
|
46
|
+
|
|
47
|
+
Financial data is spread across databases that use different identifiers. CRSP uses PERMNOs. Compustat uses GVKEYs. SEC EDGAR uses CIKs. Markets use tickers. Tickers get recycled — GM before 2009 (General Motors Corporation) and GM after 2010 (General Motors Company) are legally distinct entities that happen to share a ticker. A naive merge silently produces wrong data.
|
|
48
|
+
|
|
49
|
+
Nearly 10,000 tickers in the CRSP universe map to more than one distinct entity over time. The bug hides in silence: the merge succeeds, the DataFrame has the right number of rows, the regression runs, and the results are quietly wrong.
|
|
50
|
+
|
|
51
|
+
## What xfinlink does
|
|
52
|
+
|
|
53
|
+
- **Resolve** — any identifier → all other identifiers for the same entity, correctly matched at any point in time
|
|
54
|
+
- **Validate** — audit a merged panel and flag silent identifier errors (ticker recycling, outside-validity dates, ambiguous matches)
|
|
55
|
+
- **Fix** — return a corrected DataFrame with proper per-row entity, PERMNO, GVKEY, and CIK columns
|
|
56
|
+
|
|
57
|
+
## Install
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install xfinlink
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
WRDS ingestion is optional:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install "xfinlink[wrds]"
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Quick start
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
xfinlink init # start postgres + run migrations
|
|
73
|
+
xfinlink build # ingest EDGAR (public)
|
|
74
|
+
xfinlink build --wrds # optional: add CRSP/Compustat/IBES
|
|
75
|
+
|
|
76
|
+
xfinlink status # DB census + identifier coverage
|
|
77
|
+
xfinlink resolve ticker AAPL # → Apple Inc with all linked ids
|
|
78
|
+
xfinlink resolve ticker GM --date 2008-01-01
|
|
79
|
+
# → General Motors Corporation (pre-bankruptcy)
|
|
80
|
+
xfinlink resolve ticker GM --date 2020-01-01
|
|
81
|
+
# → General Motors Company (post-bankruptcy)
|
|
82
|
+
|
|
83
|
+
xfinlink validate panel.csv --id-column ticker --date-column date
|
|
84
|
+
# → flags stitched/ambiguous/not-found rows
|
|
85
|
+
xfinlink fix panel.csv --id-column ticker --date-column date
|
|
86
|
+
# → writes panel_fixed.csv with per-row PERMNO/GVKEY/CIK
|
|
87
|
+
|
|
88
|
+
xfinlink serve # local HTTP API on http://localhost:8000
|
|
89
|
+
xfinlink mcp # MCP server for AI agents
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Example — catching the GM stitching bug:
|
|
93
|
+
|
|
94
|
+
```
|
|
95
|
+
$ xfinlink validate gm_panel.csv --id-column ticker --date-column date
|
|
96
|
+
10 rows checked. 0 clean. 10 issues found.
|
|
97
|
+
┏━ Coverage (entity → row count) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
|
98
|
+
┃ canonical_name rows ┃
|
|
99
|
+
┃ General Motors Corporation (pre-2009 bankruptcy) 5 ┃
|
|
100
|
+
┃ General Motors Company 5 ┃
|
|
101
|
+
┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛
|
|
102
|
+
Issues written to xfinlink_validation_report.csv
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Python API
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from xfinlink import resolve, bulk_resolve, search_entities, get_lineage
|
|
109
|
+
from xfinlink import validate_dataframe, fix_dataframe
|
|
110
|
+
|
|
111
|
+
# Single resolve, with optional historical date
|
|
112
|
+
r = resolve("ticker", "AAPL")
|
|
113
|
+
r = resolve("ticker", "GM", as_of_date="2008-01-01")
|
|
114
|
+
|
|
115
|
+
# Batch resolve
|
|
116
|
+
results = bulk_resolve([
|
|
117
|
+
{"id_type": "ticker", "id_value": "AAPL"},
|
|
118
|
+
{"id_type": "gvkey", "id_value": "001690"},
|
|
119
|
+
{"id_type": "cik", "id_value": "0000320193"},
|
|
120
|
+
])
|
|
121
|
+
|
|
122
|
+
# Panel validation — the headline feature
|
|
123
|
+
import pandas as pd
|
|
124
|
+
df = pd.read_csv("gm_panel.csv")
|
|
125
|
+
report = validate_dataframe(df, "ticker", date_column="date")
|
|
126
|
+
if not report.is_clean:
|
|
127
|
+
print(report.summary)
|
|
128
|
+
print(report.coverage) # {"General Motors Corporation (pre-2009 bankruptcy)": 5, ...}
|
|
129
|
+
fixed = fix_dataframe(df, "ticker", date_column="date")
|
|
130
|
+
# fixed["_permno"], fixed["_gvkey"], fixed["_cik"], fixed["_xfinlink_flag"]
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## For AI agents
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
xfinlink mcp # start the MCP server over stdio
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
xfinlink exposes five MCP tools — `resolve_identifier`, `bulk_resolve`, `entity_lineage`, `search_entity`, `validate_identifiers` — connected directly to the local crosswalk database. See [CLAUDE.md](CLAUDE.md) for tool descriptions, the GM/FB/META disambiguation cases, and guidance on when to use each tool.
|
|
140
|
+
|
|
141
|
+
## Local HTTP server
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
xfinlink serve --port 8000
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
Endpoints:
|
|
148
|
+
|
|
149
|
+
| Method | Path | Purpose |
|
|
150
|
+
|---|---|---|
|
|
151
|
+
| POST | /v1/resolve | single identifier resolution |
|
|
152
|
+
| POST | /v1/resolve/bulk | batch resolution (max 1000) |
|
|
153
|
+
| GET | /v1/lineage/{entity_id} | corporate event history |
|
|
154
|
+
| GET | /v1/search?q=...&limit=... | name search |
|
|
155
|
+
| POST | /v1/validate | panel validation |
|
|
156
|
+
| GET | /v1/health | DB stats |
|
|
157
|
+
|
|
158
|
+
Interactive docs at http://localhost:8000/docs.
|
|
159
|
+
|
|
160
|
+
## Data sources
|
|
161
|
+
|
|
162
|
+
- **Without WRDS:** CIK, ticker, CUSIP (SEC EDGAR), ISIN, FIGI (OpenFIGI).
|
|
163
|
+
- **With WRDS:** adds PERMNO, GVKEY, IBES ticker, and the CCM linking table, plus corporate event detection (bankruptcy / relisting pairs).
|
|
164
|
+
|
|
165
|
+
## Legal
|
|
166
|
+
|
|
167
|
+
xfinlink does not redistribute any proprietary data. When you run `xfinlink build --wrds`, it connects to **your** WRDS account and builds the crosswalk locally from data you are licensed to access. The pip package contains only source code — no CRSP, Compustat, IBES, or EDGAR data is shipped.
|
|
168
|
+
|
|
169
|
+
## License
|
|
170
|
+
|
|
171
|
+
xfinlink is distributed under a commercial license with a trial mode: 500 resolutions per session, all features available. Get a license key at https://xfinlink.io and drop it in `~/.xfinlink/license.key` to remove the trial limit.
|
xfinlink-0.1.1/README.md
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# xfinlink
|
|
2
|
+
|
|
3
|
+
Financial entity identifier resolution. Resolve, validate, and fix identifier mappings across CRSP, Compustat, SEC EDGAR, IBES, and more.
|
|
4
|
+
|
|
5
|
+
## The problem
|
|
6
|
+
|
|
7
|
+
Financial data is spread across databases that use different identifiers. CRSP uses PERMNOs. Compustat uses GVKEYs. SEC EDGAR uses CIKs. Markets use tickers. Tickers get recycled — GM before 2009 (General Motors Corporation) and GM after 2010 (General Motors Company) are legally distinct entities that happen to share a ticker. A naive merge silently produces wrong data.
|
|
8
|
+
|
|
9
|
+
Nearly 10,000 tickers in the CRSP universe map to more than one distinct entity over time. The bug hides in silence: the merge succeeds, the DataFrame has the right number of rows, the regression runs, and the results are quietly wrong.
|
|
10
|
+
|
|
11
|
+
## What xfinlink does
|
|
12
|
+
|
|
13
|
+
- **Resolve** — any identifier → all other identifiers for the same entity, correctly matched at any point in time
|
|
14
|
+
- **Validate** — audit a merged panel and flag silent identifier errors (ticker recycling, outside-validity dates, ambiguous matches)
|
|
15
|
+
- **Fix** — return a corrected DataFrame with proper per-row entity, PERMNO, GVKEY, and CIK columns
|
|
16
|
+
|
|
17
|
+
## Install
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install xfinlink
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
WRDS ingestion is optional:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install "xfinlink[wrds]"
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Quick start
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
xfinlink init # start postgres + run migrations
|
|
33
|
+
xfinlink build # ingest EDGAR (public)
|
|
34
|
+
xfinlink build --wrds # optional: add CRSP/Compustat/IBES
|
|
35
|
+
|
|
36
|
+
xfinlink status # DB census + identifier coverage
|
|
37
|
+
xfinlink resolve ticker AAPL # → Apple Inc with all linked ids
|
|
38
|
+
xfinlink resolve ticker GM --date 2008-01-01
|
|
39
|
+
# → General Motors Corporation (pre-bankruptcy)
|
|
40
|
+
xfinlink resolve ticker GM --date 2020-01-01
|
|
41
|
+
# → General Motors Company (post-bankruptcy)
|
|
42
|
+
|
|
43
|
+
xfinlink validate panel.csv --id-column ticker --date-column date
|
|
44
|
+
# → flags stitched/ambiguous/not-found rows
|
|
45
|
+
xfinlink fix panel.csv --id-column ticker --date-column date
|
|
46
|
+
# → writes panel_fixed.csv with per-row PERMNO/GVKEY/CIK
|
|
47
|
+
|
|
48
|
+
xfinlink serve # local HTTP API on http://localhost:8000
|
|
49
|
+
xfinlink mcp # MCP server for AI agents
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Example — catching the GM stitching bug:
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
$ xfinlink validate gm_panel.csv --id-column ticker --date-column date
|
|
56
|
+
10 rows checked. 0 clean. 10 issues found.
|
|
57
|
+
┏━ Coverage (entity → row count) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
|
58
|
+
┃ canonical_name rows ┃
|
|
59
|
+
┃ General Motors Corporation (pre-2009 bankruptcy) 5 ┃
|
|
60
|
+
┃ General Motors Company 5 ┃
|
|
61
|
+
┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛
|
|
62
|
+
Issues written to xfinlink_validation_report.csv
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Python API
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
from xfinlink import resolve, bulk_resolve, search_entities, get_lineage
|
|
69
|
+
from xfinlink import validate_dataframe, fix_dataframe
|
|
70
|
+
|
|
71
|
+
# Single resolve, with optional historical date
|
|
72
|
+
r = resolve("ticker", "AAPL")
|
|
73
|
+
r = resolve("ticker", "GM", as_of_date="2008-01-01")
|
|
74
|
+
|
|
75
|
+
# Batch resolve
|
|
76
|
+
results = bulk_resolve([
|
|
77
|
+
{"id_type": "ticker", "id_value": "AAPL"},
|
|
78
|
+
{"id_type": "gvkey", "id_value": "001690"},
|
|
79
|
+
{"id_type": "cik", "id_value": "0000320193"},
|
|
80
|
+
])
|
|
81
|
+
|
|
82
|
+
# Panel validation — the headline feature
|
|
83
|
+
import pandas as pd
|
|
84
|
+
df = pd.read_csv("gm_panel.csv")
|
|
85
|
+
report = validate_dataframe(df, "ticker", date_column="date")
|
|
86
|
+
if not report.is_clean:
|
|
87
|
+
print(report.summary)
|
|
88
|
+
print(report.coverage) # {"General Motors Corporation (pre-2009 bankruptcy)": 5, ...}
|
|
89
|
+
fixed = fix_dataframe(df, "ticker", date_column="date")
|
|
90
|
+
# fixed["_permno"], fixed["_gvkey"], fixed["_cik"], fixed["_xfinlink_flag"]
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## For AI agents
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
xfinlink mcp # start the MCP server over stdio
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
xfinlink exposes five MCP tools — `resolve_identifier`, `bulk_resolve`, `entity_lineage`, `search_entity`, `validate_identifiers` — connected directly to the local crosswalk database. See [CLAUDE.md](CLAUDE.md) for tool descriptions, the GM/FB/META disambiguation cases, and guidance on when to use each tool.
|
|
100
|
+
|
|
101
|
+
## Local HTTP server
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
xfinlink serve --port 8000
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Endpoints:
|
|
108
|
+
|
|
109
|
+
| Method | Path | Purpose |
|
|
110
|
+
|---|---|---|
|
|
111
|
+
| POST | /v1/resolve | single identifier resolution |
|
|
112
|
+
| POST | /v1/resolve/bulk | batch resolution (max 1000) |
|
|
113
|
+
| GET | /v1/lineage/{entity_id} | corporate event history |
|
|
114
|
+
| GET | /v1/search?q=...&limit=... | name search |
|
|
115
|
+
| POST | /v1/validate | panel validation |
|
|
116
|
+
| GET | /v1/health | DB stats |
|
|
117
|
+
|
|
118
|
+
Interactive docs at http://localhost:8000/docs.
|
|
119
|
+
|
|
120
|
+
## Data sources
|
|
121
|
+
|
|
122
|
+
- **Without WRDS:** CIK, ticker, CUSIP (SEC EDGAR), ISIN, FIGI (OpenFIGI).
|
|
123
|
+
- **With WRDS:** adds PERMNO, GVKEY, IBES ticker, and the CCM linking table, plus corporate event detection (bankruptcy / relisting pairs).
|
|
124
|
+
|
|
125
|
+
## Legal
|
|
126
|
+
|
|
127
|
+
xfinlink does not redistribute any proprietary data. When you run `xfinlink build --wrds`, it connects to **your** WRDS account and builds the crosswalk locally from data you are licensed to access. The pip package contains only source code — no CRSP, Compustat, IBES, or EDGAR data is shipped.
|
|
128
|
+
|
|
129
|
+
## License
|
|
130
|
+
|
|
131
|
+
xfinlink is distributed under a commercial license with a trial mode: 500 resolutions per session, all features available. Get a license key at https://xfinlink.io and drop it in `~/.xfinlink/license.key` to remove the trial limit.
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "xfinlink"
|
|
3
|
+
version = "0.1.1"
|
|
4
|
+
description = "Financial entity identifier resolution — resolve, validate, and fix PERMNO/GVKEY/CIK/CUSIP/ticker/ISIN mappings across CRSP, Compustat, EDGAR, and more."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
authors = [{name = "xfinlink"}]
|
|
8
|
+
license = "LicenseRef-Proprietary"
|
|
9
|
+
keywords = ["finance", "CRSP", "Compustat", "EDGAR", "PERMNO", "GVKEY", "CUSIP", "ticker", "identifier", "resolver"]
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Development Status :: 4 - Beta",
|
|
12
|
+
"Intended Audience :: Financial and Insurance Industry",
|
|
13
|
+
"Intended Audience :: Science/Research",
|
|
14
|
+
"Operating System :: OS Independent",
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Programming Language :: Python :: 3.11",
|
|
17
|
+
"Programming Language :: Python :: 3.12",
|
|
18
|
+
"Programming Language :: Python :: 3.13",
|
|
19
|
+
"Topic :: Office/Business :: Financial",
|
|
20
|
+
]
|
|
21
|
+
dependencies = [
|
|
22
|
+
"pandas",
|
|
23
|
+
"pyarrow",
|
|
24
|
+
"sqlalchemy>=2.0",
|
|
25
|
+
"psycopg2-binary",
|
|
26
|
+
"asyncpg",
|
|
27
|
+
"greenlet",
|
|
28
|
+
"fastapi",
|
|
29
|
+
"uvicorn",
|
|
30
|
+
"httpx",
|
|
31
|
+
"alembic",
|
|
32
|
+
"pydantic",
|
|
33
|
+
"python-dotenv",
|
|
34
|
+
"typer[all]",
|
|
35
|
+
"mcp",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[project.optional-dependencies]
|
|
39
|
+
wrds = ["wrds"]
|
|
40
|
+
dev = ["pytest", "pytest-asyncio"]
|
|
41
|
+
|
|
42
|
+
[project.urls]
|
|
43
|
+
Homepage = "https://xfinlink.io"
|
|
44
|
+
Documentation = "https://xfinlink.io/docs"
|
|
45
|
+
|
|
46
|
+
[build-system]
|
|
47
|
+
requires = ["setuptools>=68"]
|
|
48
|
+
build-backend = "setuptools.build_meta"
|
|
49
|
+
|
|
50
|
+
[project.scripts]
|
|
51
|
+
xfinlink = "xfinlink.cli:app"
|
|
52
|
+
|
|
53
|
+
[tool.setuptools.packages.find]
|
|
54
|
+
include = ["xfinlink*"]
|
|
55
|
+
exclude = ["tests*", "data*", "scripts*"]
|
|
56
|
+
|
|
57
|
+
[tool.setuptools.package-data]
|
|
58
|
+
xfinlink = [
|
|
59
|
+
"py.typed",
|
|
60
|
+
"alembic.ini",
|
|
61
|
+
"migrations/*.py",
|
|
62
|
+
"migrations/*.mako",
|
|
63
|
+
"migrations/README",
|
|
64
|
+
"migrations/versions/*.py",
|
|
65
|
+
]
|
|
66
|
+
|
|
67
|
+
[tool.pytest.ini_options]
|
|
68
|
+
testpaths = ["tests"]
|
|
69
|
+
asyncio_mode = "auto"
|
xfinlink-0.1.1/setup.cfg
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""xfinlink — financial entity identifier resolution.
|
|
2
|
+
|
|
3
|
+
Public Python API. Synchronous wrappers around the async resolver so
|
|
4
|
+
callers do not have to manage event loops or database sessions.
|
|
5
|
+
|
|
6
|
+
from xfinlink import resolve, bulk_resolve, search_entities, get_lineage
|
|
7
|
+
|
|
8
|
+
r = resolve("ticker", "AAPL")
|
|
9
|
+
r = resolve("ticker", "GM", as_of_date="2008-01-01")
|
|
10
|
+
results = bulk_resolve([{"id_type": "ticker", "id_value": "AAPL"}, ...])
|
|
11
|
+
hits = search_entities("General Motors")
|
|
12
|
+
lineage = get_lineage("ticker", "GM")
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import asyncio
|
|
17
|
+
from datetime import date, datetime
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
from xfinlink._db_check import check_db_ready as _check_db_ready
|
|
21
|
+
from xfinlink.db import AsyncSessionLocal
|
|
22
|
+
from xfinlink.panel import ValidationReport, fix_dataframe, validate_dataframe
|
|
23
|
+
from xfinlink.resolver import core as _core
|
|
24
|
+
from xfinlink.resolver.core import (
|
|
25
|
+
EntityLineage,
|
|
26
|
+
LineageEvent,
|
|
27
|
+
ResolutionResult,
|
|
28
|
+
ResolvedEntity,
|
|
29
|
+
ResolvedIdentifier,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
# Names re-exported as the public surface of the package.
__all__ = [
    # Synchronous resolver entry points defined in this module.
    "resolve",
    "bulk_resolve",
    "search_entities",
    "get_lineage",
    # Panel helpers re-exported from xfinlink.panel.
    "validate_dataframe",
    "fix_dataframe",
    "ValidationReport",
    # Result types re-exported from xfinlink.resolver.core.
    "ResolutionResult",
    "ResolvedEntity",
    "ResolvedIdentifier",
    "EntityLineage",
    "LineageEvent",
]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _to_date(value: Any) -> date | None:
    """Coerce a caller-supplied date-like value into a plain ``date``.

    ``None`` and plain ``date`` objects are returned unchanged, ``datetime``
    objects are truncated to their calendar date, and strings are parsed as
    ``YYYY-MM-DD``.

    Raises:
        TypeError: if ``value`` is none of the supported types.
        ValueError: propagated from ``strptime`` when a string does not
            match ``YYYY-MM-DD``.
    """
    if value is None:
        return None
    # datetime is a subclass of date, so it has to be tested first.
    if isinstance(value, datetime):
        return value.date()
    if isinstance(value, date):
        return value
    if isinstance(value, str):
        parsed = datetime.strptime(value, "%Y-%m-%d")
        return parsed.date()
    raise TypeError(f"unsupported date type: {type(value).__name__}")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _run(coro):
    """Run ``coro`` to completion and return its result, synchronously.

    With no event loop running in the calling thread this is simply
    ``asyncio.run(coro)``. ``asyncio.run`` refuses to start when a loop is
    already running in the current thread (for example inside a Jupyter
    notebook or an async web handler), so in that case the coroutine is
    executed by ``asyncio.run`` on a short-lived worker thread instead.

    The call blocks until the coroutine finishes in both cases. Exceptions
    raised by the coroutine propagate to the caller.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: the plain path is safe.
        return asyncio.run(coro)

    # Imported here so the fallback adds no module-level dependency.
    from concurrent.futures import ThreadPoolExecutor

    # A fresh thread has no running loop, so asyncio.run works there.
    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def resolve(
    id_type: str,
    id_value: str,
    as_of_date: str | date | None = None,
) -> ResolutionResult:
    """Look up one identifier and return its resolution result.

    Checks that the database is ready, normalizes ``as_of_date`` through
    ``_to_date``, then awaits the async resolver inside a fresh session.

    Args:
        id_type: kind of identifier being passed (e.g. ``"ticker"``).
        id_value: the identifier value to resolve.
        as_of_date: optional date (``YYYY-MM-DD`` string or ``date``)
            forwarded to the resolver.
    """
    _check_db_ready()
    cutoff = _to_date(as_of_date)

    async def _lookup() -> ResolutionResult:
        async with AsyncSessionLocal() as db:
            outcome = await _core.resolve(db, id_type, id_value, cutoff)
            return outcome

    return _run(_lookup())
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def bulk_resolve(requests: list[dict]) -> list[ResolutionResult]:
    """Resolve many identifier requests in a single call.

    Each request is shallow-copied so the caller's dicts are not mutated;
    when a request carries an ``"as_of_date"`` key its value is normalized
    through ``_to_date`` before the batch is handed to the async resolver.
    """
    _check_db_ready()

    def _prepare(request: dict) -> dict:
        item = dict(request)
        if "as_of_date" in item:
            item["as_of_date"] = _to_date(item["as_of_date"])
        return item

    payload: list[dict] = [_prepare(request) for request in requests]

    async def _lookup_all() -> list[ResolutionResult]:
        async with AsyncSessionLocal() as db:
            outcomes = await _core.bulk_resolve(db, payload)
            return outcomes

    return _run(_lookup_all())
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def search_entities(query: str, limit: int = 10) -> list[ResolvedEntity]:
    """Search entities by canonical name (fuzzy match).

    Args:
        query: text to search for.
        limit: forwarded to the async search as its ``limit`` keyword.
    """
    _check_db_ready()

    async def _search() -> list[ResolvedEntity]:
        async with AsyncSessionLocal() as db:
            hits = await _core.search_entities(db, query, limit=limit)
            return hits

    return _run(_search())
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def get_lineage(
    id_type: str,
    id_value: str,
    as_of_date: str | date | None = None,
) -> EntityLineage | None:
    """Fetch the corporate lineage of the entity behind an identifier.

    The identifier ``(id_type, id_value)`` is resolved first and the
    lineage is then requested for the first (top-ranked) match. The async
    traversal walks ticker-siblings, so both sides of a
    bankruptcy/relisting pair land in the same lineage whichever match
    seeds it.

    Returns:
        The lineage, or ``None`` when the identifier matched no entity.
    """
    _check_db_ready()
    cutoff = _to_date(as_of_date)

    async def _walk() -> EntityLineage | None:
        async with AsyncSessionLocal() as db:
            resolution = await _core.resolve(db, id_type, id_value, cutoff)
            if not resolution.not_found:
                seed = resolution.matches[0]
                return await _core.get_lineage(db, seed.entity_id)
            return None

    return _run(_walk())
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Shared DB-readiness check used by the public API and panel module."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from sqlalchemy import func, select
|
|
5
|
+
from sqlalchemy.exc import OperationalError
|
|
6
|
+
|
|
7
|
+
from xfinlink.db import SessionLocal
|
|
8
|
+
from xfinlink.models import Entity
|
|
9
|
+
|
|
10
|
+
# Single user-facing message for every "database is not usable yet" case.
_DB_ERROR = (
    "Database not found or empty. "
    "Run `xfinlink init && xfinlink build` first."
)


def check_db_ready() -> None:
    """Raise ``RuntimeError`` unless the crosswalk database is usable.

    Counts the rows of the ``Entity`` table through a synchronous session.
    The database is treated as not ready when the query fails with an
    ``OperationalError`` or a ``ProgrammingError``, or when the count is
    zero or ``None``.

    Raises:
        RuntimeError: with the setup hint in ``_DB_ERROR``; chained to the
            underlying SQLAlchemy error when the query itself failed.
    """
    # Local import so the fix does not depend on the module's import block.
    from sqlalchemy.exc import ProgrammingError

    try:
        with SessionLocal() as s:
            n = s.scalar(select(func.count()).select_from(Entity))
    # A reachable server whose schema has not been migrated typically
    # surfaces as ProgrammingError (undefined table) rather than
    # OperationalError, so both map to the same setup hint.
    except (OperationalError, ProgrammingError) as exc:
        raise RuntimeError(_DB_ERROR) from exc
    if not n:
        raise RuntimeError(_DB_ERROR)
|