adminlineage 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- adminlineage-0.2.0/LICENSE +21 -0
- adminlineage-0.2.0/PKG-INFO +627 -0
- adminlineage-0.2.0/README.md +590 -0
- adminlineage-0.2.0/pyproject.toml +68 -0
- adminlineage-0.2.0/setup.cfg +4 -0
- adminlineage-0.2.0/src/adminlineage/__init__.py +15 -0
- adminlineage-0.2.0/src/adminlineage/__main__.py +6 -0
- adminlineage-0.2.0/src/adminlineage/api.py +150 -0
- adminlineage-0.2.0/src/adminlineage/candidates.py +131 -0
- adminlineage-0.2.0/src/adminlineage/cli.py +169 -0
- adminlineage-0.2.0/src/adminlineage/config.py +129 -0
- adminlineage-0.2.0/src/adminlineage/export.py +44 -0
- adminlineage-0.2.0/src/adminlineage/io.py +154 -0
- adminlineage-0.2.0/src/adminlineage/llm/__init__.py +21 -0
- adminlineage-0.2.0/src/adminlineage/llm/base.py +50 -0
- adminlineage-0.2.0/src/adminlineage/llm/cache.py +108 -0
- adminlineage-0.2.0/src/adminlineage/llm/gemini.py +692 -0
- adminlineage-0.2.0/src/adminlineage/llm/mock.py +111 -0
- adminlineage-0.2.0/src/adminlineage/llm/retry.py +32 -0
- adminlineage-0.2.0/src/adminlineage/logging_utils.py +36 -0
- adminlineage-0.2.0/src/adminlineage/models.py +249 -0
- adminlineage-0.2.0/src/adminlineage/normalize.py +80 -0
- adminlineage-0.2.0/src/adminlineage/pipeline.py +1261 -0
- adminlineage-0.2.0/src/adminlineage/pipeline_adjudication.py +534 -0
- adminlineage-0.2.0/src/adminlineage/pipeline_materialization.py +210 -0
- adminlineage-0.2.0/src/adminlineage/pipeline_second_stage.py +716 -0
- adminlineage-0.2.0/src/adminlineage/prompts.py +211 -0
- adminlineage-0.2.0/src/adminlineage/replay.py +230 -0
- adminlineage-0.2.0/src/adminlineage/review.py +107 -0
- adminlineage-0.2.0/src/adminlineage/schema.py +142 -0
- adminlineage-0.2.0/src/adminlineage/utils.py +104 -0
- adminlineage-0.2.0/src/adminlineage/validation.py +215 -0
- adminlineage-0.2.0/src/adminlineage.egg-info/PKG-INFO +627 -0
- adminlineage-0.2.0/src/adminlineage.egg-info/SOURCES.txt +36 -0
- adminlineage-0.2.0/src/adminlineage.egg-info/dependency_links.txt +1 -0
- adminlineage-0.2.0/src/adminlineage.egg-info/entry_points.txt +2 -0
- adminlineage-0.2.0/src/adminlineage.egg-info/requires.txt +18 -0
- adminlineage-0.2.0/src/adminlineage.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Taha Ibrahim Siddiqui
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,627 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: adminlineage
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Build administrative evolution keys across time with exact-match constrained Gemini adjudication
|
|
5
|
+
Author: AdminLineage Contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/TahaIbrahimSiddiqui/AdminLineageAI
|
|
8
|
+
Project-URL: Repository, https://github.com/TahaIbrahimSiddiqui/AdminLineageAI
|
|
9
|
+
Project-URL: Issues, https://github.com/TahaIbrahimSiddiqui/AdminLineageAI/issues
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Requires-Dist: pandas>=2.0
|
|
21
|
+
Requires-Dist: pydantic>=2.7
|
|
22
|
+
Requires-Dist: PyYAML>=6.0
|
|
23
|
+
Requires-Dist: python-dotenv>=1.0
|
|
24
|
+
Requires-Dist: google-genai>=0.7
|
|
25
|
+
Provides-Extra: io
|
|
26
|
+
Requires-Dist: pyarrow>=15.0; extra == "io"
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
29
|
+
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
30
|
+
Requires-Dist: pandas-stubs>=2.2; extra == "dev"
|
|
31
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
32
|
+
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
33
|
+
Requires-Dist: twine>=5.1; extra == "dev"
|
|
34
|
+
Requires-Dist: types-PyYAML>=6.0; extra == "dev"
|
|
35
|
+
Requires-Dist: vulture>=2.16; extra == "dev"
|
|
36
|
+
Dynamic: license-file
|
|
37
|
+
|
|
38
|
+
# AdminLineageAI
|
|
39
|
+
|
|
40
|
+
AdminLineageAI makes crosswalks between administrative locations such as districts (ADM2), subdistricts (ADM3), states (ADM1), and countries (ADM0) across datasets that may come from completely different sources and different periods. It uses AI to compare likely matches, reason over spelling variants, language-specific forms, and administrative splits, merges, and renames, and produce a usable crosswalk plus review artifacts.
|
|
41
|
+
|
|
42
|
+
Matching administrative units by hand is labour-intensive work. Through this package, we hope to reduce the manual work of matching administrative units between datasets while still keeping a clear review trail and reproducibility.
|
|
43
|
+
|
|
44
|
+
The package generates candidate matches between two datasets, asks Gemini to choose among them, and writes a crosswalk plus review artifacts. It outputs a final evolution key plus review files as CSV and Parquet.
|
|
45
|
+
|
|
46
|
+
<p align="center">
|
|
47
|
+
<img alt="This is an experimental utility. Treat these crosswalks as assistive outputs and cross-verify them, especially in important cases." src="https://img.shields.io/static/v1?label=This%20is%20an%20experimental%20utility.&message=Treat%20these%20crosswalks%20as%20assistive%20outputs%20and%20cross-verify%20them%2C%20especially%20in%20important%20cases.&color=red">
|
|
48
|
+
</p>
|
|
49
|
+
|
|
50
|
+
## Possible use cases
|
|
51
|
+
|
|
52
|
+
Below are a few possible scenarios where this package can help. We would also love to hear about other user experiences and use cases for this package.
|
|
53
|
+
|
|
54
|
+
- For instance, you may have a dataset from a government scheme and need to match it against a standard administrative list such as a census table. The scheme source may write `Paschimi Singhbhum` while the other uses `West Singhbhum`. Plain fuzzy matching will miss cases like this unless you manually standardize prefixes and suffixes first, whereas AI can match them because it has the context that `paschim` in Hindi means `west`. The same kind of issue shows up across many widely spoken languages.
|
|
55
|
+
- Handling administrative churn. Districts and other units are regularly split, merged, renamed, or grouped differently, and there is often no up-to-date public evolution list for newly created units. The package runs a wide Google search to find a possible predecessor or successor for each administrative unit in the primary dataset.
|
|
56
|
+
- Creating entirely new evolution crosswalks that do not yet exist between two time periods at a given administrative level.
|
|
57
|
+
|
|
58
|
+
## Important Features
|
|
59
|
+
|
|
60
|
+
- The default settings are chosen to give the best results at minimal token cost. Feel free to change them according to your needs.
|
|
61
|
+
- To keep token costs minimal, the package runs exact string matching and prunes matched candidates on the primary side before the first stage.
|
|
62
|
+
- Hierarchical matching with `exact_match`. If your data are nested, you can match names within exact scopes such as `country`, `state`, or `district`. For example, you can choose to match district names only within a state, or subdistricts only within a district. This works well, but the exact-match column strings need to line up exactly across both datasets.
|
|
63
|
+
- Replay and reproducibility. Academic pipelines often need to be rerun many times. With replay enabled, repeated semantic requests can reuse prior completed LLM work instead of calling the API again. The `seed` parameter helps keep request identity deterministic and makes reruns easier to reproduce.
|
|
64
|
+
|
|
65
|
+
The supported live workflow in AdminLineageAI is:
|
|
66
|
+
|
|
67
|
+
- Compatible with any `gemini-3+` model
|
|
68
|
+
- Google Search grounding enabled
|
|
69
|
+
- strict JSON output from the model
|
|
70
|
+
- user-controlled batching with automatic split fallback on failed multi-row requests
|
|
71
|
+
- an optional bounded second-stage rescue pass for unmatched rows when `string_exact_match_prune`
|
|
72
|
+
is set to `from` or `to`
|
|
73
|
+
|
|
74
|
+
The bounded second stage works like this:
|
|
75
|
+
|
|
76
|
+
- first pass still does the normal grounded shortlist adjudication
|
|
77
|
+
- if `string_exact_match_prune="from"`, the rescue pass revisits rows with `merge="only_in_from"`
|
|
78
|
+
- if `string_exact_match_prune="to"`, it revisits rows with `merge="only_in_to"`
|
|
79
|
+
- it runs one grounded research call to look for a predecessor or successor name
|
|
80
|
+
- if that research comes back as `unknown` with no lineage hint, the row is left alone and the
|
|
81
|
+
rescue pass stops there
|
|
82
|
+
- otherwise it searches the full opposite table, rebuilds a short global shortlist, and runs one
|
|
83
|
+
final strict JSON decision call without additional search grounding
|
|
84
|
+
- the second stage is sequential, one-pass, resumable, and writes `second_stage_results.jsonl`
|
|
85
|
+
|
|
86
|
+
## How To Use
|
|
87
|
+
|
|
88
|
+
You do not need the CLI to use AdminLineageAI. The simplest path is the Python API.
|
|
89
|
+
|
|
90
|
+
1. Install the published package.
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
pip install adminlineage
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
Install the optional parquet dependency if you want parquet output support:
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
pip install "adminlineage[io]"
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
2. Set a Gemini API key in `GEMINI_API_KEY`, or use another environment variable name and pass it explicitly.
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
GEMINI_API_KEY=your_api_key_here
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
The package can load a nearby `.env` file when it looks for the key.
|
|
109
|
+
|
|
110
|
+
3. Choose the name column on each side, and add optional exact-match columns, IDs, or extra context columns if you have them.
|
|
111
|
+
|
|
112
|
+
4. Run the matcher.
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
import pandas as pd
|
|
116
|
+
import adminlineage
|
|
117
|
+
|
|
118
|
+
df_from = pd.read_csv("from_units.csv")
|
|
119
|
+
df_to = pd.read_csv("to_units.csv")
|
|
120
|
+
|
|
121
|
+
crosswalk_df, metadata = adminlineage.build_evolution_key(
|
|
122
|
+
df_from,
|
|
123
|
+
df_to,
|
|
124
|
+
country="India",
|
|
125
|
+
year_from=1951,
|
|
126
|
+
year_to=2001,
|
|
127
|
+
map_col_from="district",
|
|
128
|
+
map_col_to="district",
|
|
129
|
+
exact_match=["state"],
|
|
130
|
+
id_col_from="unit_id",
|
|
131
|
+
id_col_to="unit_id",
|
|
132
|
+
relationship="auto",
|
|
133
|
+
string_exact_match_prune="from",
|
|
134
|
+
evidence=False,
|
|
135
|
+
reason=False,
|
|
136
|
+
model="gemini-3.1-flash-lite-preview",
|
|
137
|
+
gemini_api_key_env="GEMINI_API_KEY",
|
|
138
|
+
replay_enabled=True,
|
|
139
|
+
seed=42,
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
print(crosswalk_df[["from_name", "to_name", "merge", "score"]].head())
|
|
143
|
+
print(metadata["artifacts"])
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
6. Review the outputs. By default, AdminLineageAI writes artifacts under `outputs/<country>_<year_from>_<year_to>_<map_col_from>`. The main ones are `evolution_key.csv`, `review_queue.csv`, and `run_metadata.json`.
|
|
147
|
+
|
|
148
|
+
## Common Options
|
|
149
|
+
|
|
150
|
+
- `exact_match`: Restricts matching to rows that agree exactly on one or more scope columns such as `country`, `state`, or `district`.
|
|
151
|
+
- `string_exact_match_prune`: Controls how aggressively exact string hits are removed from later AI work. Use this to control token spend.
|
|
152
|
+
- `relationship`: Declares the kind of relationship you expect, or leave it as `auto`.
|
|
153
|
+
- `max_candidates`: Limits how many candidate rows are shown to the model for each source row. The default is 6.
|
|
154
|
+
- `evidence`: Adds a short factual summary column.
|
|
155
|
+
- `reason`: Adds a longer explanation column.
|
|
156
|
+
- `replay_enabled`: Reuses prior completed LLM work when the semantic request matches.
|
|
157
|
+
- `seed`: Keeps request identity deterministic for more reproducible reruns.
|
|
158
|
+
- `output_dir`: Changes where run artifacts are written.
|
|
159
|
+
|
|
160
|
+
## Matching Flow Example
|
|
161
|
+
|
|
162
|
+
This example follows a nested district-level match inside `India > Uttar Pradesh` from `2011` to `2025`. Here `string_exact_match_prune='to'`, which sets `to` as the primary side and `from` as the secondary side, where all candidates stay global.
|
|
163
|
+
|
|
164
|
+
```mermaid
|
|
165
|
+
flowchart TD
|
|
166
|
+
A["From table (2011)<br/>India / Uttar Pradesh / Agra<br/>India / Uttar Pradesh / Kanpur Dehat<br/>India / Uttar Pradesh / Faizabad<br/>India / Uttar Pradesh / Allahabad"]
|
|
167
|
+
B["To table (2025)<br/>India / Uttar Pradesh / Agra<br/>India / Uttar Pradesh / Kanpur Rural<br/>India / Uttar Pradesh / Ayodhya<br/>India / Uttar Pradesh / Prayagraj"]
|
|
168
|
+
C["Nested settings<br/>map_col='district'<br/>exact_match=['state']<br/>string_exact_match_prune='to'<br/>this set 'to' as primary side<br/>and 'from' as secondary side<br/>where all candidates stay global"]
|
|
169
|
+
D["Validate inputs and normalize names"]
|
|
170
|
+
E["Exact string match pruning before LLM"]
|
|
171
|
+
F["Agra -> Agra<br/>no LLM used here<br/>just exact string match"]
|
|
172
|
+
H["AI matches remaining rows on primary side<br/>(Kanpur Rural, Ayodhya, Prayagraj)<br/>using grounded Gemini search<br/>"]
|
|
173
|
+
I["AI matches Kanpur Dehat -> Kanpur Rural<br/>because it has context that 'dehat' means 'rural' in Hindi"]
|
|
174
|
+
J{"Do Ayodhya or Prayagraj stay unmatched<br/>after first stage?"}
|
|
175
|
+
L["Do intensive Gemini search of potential predecessor / successor of Ayodhya / Prayagraj<br/>if they were renamed, merged, split, or transferred"]
|
|
176
|
+
M["If Gemini finds a potential predecessor / successor for that district<br/>match it with the global district list from the secondary side"]
|
|
177
|
+
N["Write final evolution key<br/>Agra -> Agra<br/>Kanpur Dehat -> Kanpur Rural<br/>Faizabad -> Ayodhya<br/>Allahabad -> Prayagraj"]
|
|
178
|
+
O["Write artifacts<br/>evolution_key.csv<br/>review_queue.csv<br/>run_metadata.json<br/>replay bundle"]
|
|
179
|
+
|
|
180
|
+
subgraph G["First stage"]
|
|
181
|
+
H
|
|
182
|
+
I
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
subgraph P["Second stage"]
|
|
186
|
+
L
|
|
187
|
+
M
|
|
188
|
+
end
|
|
189
|
+
|
|
190
|
+
A --> C
|
|
191
|
+
B --> C
|
|
192
|
+
C --> D
|
|
193
|
+
D --> E
|
|
194
|
+
E --> F
|
|
195
|
+
E --> H
|
|
196
|
+
H --> I
|
|
197
|
+
I --> J
|
|
198
|
+
J -- "No" --> N
|
|
199
|
+
J -- "Yes" --> L
|
|
200
|
+
L --> M
|
|
201
|
+
A --> N
|
|
202
|
+
B --> N
|
|
203
|
+
M --> N
|
|
204
|
+
N --> O
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
## Hand Check Against Scheme Ground Truth
|
|
208
|
+
|
|
209
|
+
This is a quick hand check against a human-made evolution key for a government scheme implemented nationally in India. The scheme side is `2025` districts, mapped back to their predecessor `2011` districts.
|
|
210
|
+
|
|
211
|
+
The comparison is oriented from the scheme side: for each `district_2025` in the hand key, does the evolution key recover the expected `district_2011` predecessor? Names were normalized before comparison. Spelling and transliteration-only differences were treated as `aligns`. A row counts as a match only when the evolution key has a non-blank `from_name`.
|
|
212
|
+
|
|
213
|
+
- `aligns` means the evolution key points to the same 2011 district name
|
|
214
|
+
- `disagrees` means the evolution key points to a different 2011 district
|
|
215
|
+
- `no match` means the evolution key does not provide any non-blank `from_name`
|
|
216
|
+
|
|
217
|
+
| Outcome | Count | Share of 612 hand-coded district pairs |
|
|
218
|
+
|---|---:|---:|
|
|
219
|
+
| Aligns with scheme hand mapping | 595 | 97.22% |
|
|
220
|
+
| Disagrees with scheme hand mapping | 11 | 1.80% |
|
|
221
|
+
| Evolution key provides no 2011 match | 6 | 0.98% |
|
|
222
|
+
|
|
223
|
+
Takeaway: most scheme districts map back to the same 2011 predecessor as the hand key, a few disagree, and a small number have no match. Treat this as a sanity check, not a full audit.
|
|
224
|
+
|
|
225
|
+
## Optional CLI Workflow
|
|
226
|
+
|
|
227
|
+
The CLI is useful when you want a saved YAML config for repeatable runs, but it is optional.
|
|
228
|
+
|
|
229
|
+
```bash
|
|
230
|
+
adminlineage preview --config examples/config/example.yml
|
|
231
|
+
adminlineage validate --config examples/config/example.yml
|
|
232
|
+
adminlineage run --config examples/config/example.yml
|
|
233
|
+
adminlineage export --input outputs/india_1951_2001_subdistrict/evolution_key.csv --format jsonl
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
The package includes these example assets:
|
|
237
|
+
|
|
238
|
+
- `examples/config/example.yml`
|
|
239
|
+
- `examples/loaders/sample_loader.py`
|
|
240
|
+
- `examples/adminlineage_gemini_3_1_flash_lite.ipynb`
|
|
241
|
+
|
|
242
|
+
## Python API
|
|
243
|
+
|
|
244
|
+
Public objects available from `import adminlineage`:
|
|
245
|
+
|
|
246
|
+
- `build_evolution_key`
|
|
247
|
+
- `preview_plan`
|
|
248
|
+
- `validate_inputs`
|
|
249
|
+
- `export_crosswalk`
|
|
250
|
+
- `get_output_schema_definition`
|
|
251
|
+
- `OUTPUT_SCHEMA_VERSION`
|
|
252
|
+
- `__version__`
|
|
253
|
+
|
|
254
|
+
### `build_evolution_key`
|
|
255
|
+
|
|
256
|
+
Build the evolution key and write run artifacts.
|
|
257
|
+
|
|
258
|
+
Required arguments:
|
|
259
|
+
|
|
260
|
+
| Argument | Type | Meaning |
|
|
261
|
+
|---|---|---|
|
|
262
|
+
| `df_from` | `pd.DataFrame` | Earlier-period table |
|
|
263
|
+
| `df_to` | `pd.DataFrame` | Later-period table |
|
|
264
|
+
| `country` | `str` | Country label used in prompts and metadata |
|
|
265
|
+
| `year_from` | `int \| str` | Earlier-period label |
|
|
266
|
+
| `year_to` | `int \| str` | Later-period label |
|
|
267
|
+
| `map_col_from` | `str` | Source name column |
|
|
268
|
+
|
|
269
|
+
Optional arguments:
|
|
270
|
+
|
|
271
|
+
| Argument | Type | Default | Meaning |
|
|
272
|
+
|---|---|---|---|
|
|
273
|
+
| `map_col_to` | `str \| None` | `None` | Target name column. Falls back to `map_col_from` when omitted. |
|
|
274
|
+
| `exact_match` | `list[str] \| None` | `None` | Columns that must agree before comparison. |
|
|
275
|
+
| `id_col_from` | `str \| None` | `None` | Source ID column. |
|
|
276
|
+
| `id_col_to` | `str \| None` | `None` | Target ID column. |
|
|
277
|
+
| `extra_context_cols` | `list[str] \| None` | `None` | Extra columns added to the model payload. |
|
|
278
|
+
| `relationship` | `str` | `auto` | One of `auto`, `father_to_father`, `father_to_child`, `child_to_father`, `child_to_child`. |
|
|
279
|
+
| `string_exact_match_prune` | `str` | `none` | `none` keeps exact-string hits in later AI work, `from` removes matched source rows from AI work, `to` removes matched source and target rows from later AI work. |
|
|
280
|
+
| `evidence` | `bool` | `False` | Adds a short evidence summary and includes the `evidence` column. |
|
|
281
|
+
| `reason` | `bool` | `False` | Adds a longer explanation in the `reason` column. |
|
|
282
|
+
| `model` | `str` | `gemini-3.1-flash-lite-preview` | Gemini model name. |
|
|
283
|
+
| `gemini_api_key_env` | `str` | `GEMINI_API_KEY` | Environment variable name used for the API key. |
|
|
284
|
+
| `batch_size` | `int` | `5` | Maximum number of source rows per Gemini request. When a multi-row request fails, the pipeline retries in smaller batches. |
|
|
285
|
+
| `max_candidates` | `int` | `6` | Candidate shortlist size per source row. |
|
|
286
|
+
| `output_dir` | `str \| Path` | `outputs` | Base output directory for run artifacts. |
|
|
287
|
+
| `seed` | `int` | `42` | Deterministic seed for repeatable request identity. |
|
|
288
|
+
| `temperature` | `float` | `0.75` | Gemini temperature. |
|
|
289
|
+
| `enable_google_search` | `bool` | `True` | Enables grounded Gemini adjudication. |
|
|
290
|
+
| `request_timeout_seconds` | `int \| None` | `90` | Per-request timeout. |
|
|
291
|
+
| `env_search_dir` | `str \| Path \| None` | `None` | Starting directory used when searching for `.env`. |
|
|
292
|
+
| `replay_enabled` | `bool` | `False` | Reuses prior completed LLM work when the semantic request matches. |
|
|
293
|
+
| `replay_store_dir` | `str \| Path \| None` | `None` | Replay store path. Falls back to `.adminlineage_replay` internally when replay is enabled. |
|
|
294
|
+
|
|
295
|
+
Return value:
|
|
296
|
+
|
|
297
|
+
- `tuple[pd.DataFrame, dict]`
|
|
298
|
+
- first item: the crosswalk DataFrame
|
|
299
|
+
- second item: run metadata with counts, warnings, request details, and artifact paths
|
|
300
|
+
|
|
301
|
+
### `preview_plan`
|
|
302
|
+
|
|
303
|
+
Preview grouping and candidate-generation behavior without calling Gemini.
|
|
304
|
+
|
|
305
|
+
```python
|
|
306
|
+
adminlineage.preview_plan(
|
|
307
|
+
df_from,
|
|
308
|
+
df_to,
|
|
309
|
+
*,
|
|
310
|
+
country,
|
|
311
|
+
year_from,
|
|
312
|
+
year_to,
|
|
313
|
+
map_col_from,
|
|
314
|
+
map_col_to=None,
|
|
315
|
+
exact_match=None,
|
|
316
|
+
id_col_from=None,
|
|
317
|
+
id_col_to=None,
|
|
318
|
+
extra_context_cols=None,
|
|
319
|
+
string_exact_match_prune="none",
|
|
320
|
+
max_candidates=6,
|
|
321
|
+
)
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
Return value: a diagnostics dict describing validity, group sizes, exact-string hits, and candidate budgets.
|
|
325
|
+
|
|
326
|
+
### `validate_inputs`
|
|
327
|
+
|
|
328
|
+
Validate the two input tables without running the pipeline.
|
|
329
|
+
|
|
330
|
+
```python
|
|
331
|
+
adminlineage.validate_inputs(
|
|
332
|
+
df_from,
|
|
333
|
+
df_to,
|
|
334
|
+
*,
|
|
335
|
+
country,
|
|
336
|
+
map_col_from,
|
|
337
|
+
map_col_to=None,
|
|
338
|
+
exact_match=None,
|
|
339
|
+
id_col_from=None,
|
|
340
|
+
id_col_to=None,
|
|
341
|
+
)
|
|
342
|
+
```
|
|
343
|
+
|
|
344
|
+
Return value: a diagnostics dict that reports whether the inputs are valid and what is missing or duplicated.
|
|
345
|
+
|
|
346
|
+
### `export_crosswalk`
|
|
347
|
+
|
|
348
|
+
Convert a materialized crosswalk file into another format.
|
|
349
|
+
|
|
350
|
+
```python
|
|
351
|
+
adminlineage.export_crosswalk(
|
|
352
|
+
input_path="outputs/india_1951_2001_subdistrict/evolution_key.csv",
|
|
353
|
+
output_format="jsonl",
|
|
354
|
+
output_path=None,
|
|
355
|
+
)
|
|
356
|
+
```
|
|
357
|
+
|
|
358
|
+
Return value: the written output path.
|
|
359
|
+
|
|
360
|
+
Supported output formats:
|
|
361
|
+
|
|
362
|
+
- `csv`
|
|
363
|
+
- `parquet`
|
|
364
|
+
- `jsonl`
|
|
365
|
+
|
|
366
|
+
### `get_output_schema_definition`
|
|
367
|
+
|
|
368
|
+
Return a machine-readable description of the materialized output schema.
|
|
369
|
+
|
|
370
|
+
```python
|
|
371
|
+
schema = adminlineage.get_output_schema_definition(include_evidence=False)
|
|
372
|
+
```
|
|
373
|
+
|
|
374
|
+
Arguments:
|
|
375
|
+
|
|
376
|
+
| Argument | Type | Default | Meaning |
|
|
377
|
+
|---|---|---|---|
|
|
378
|
+
| `include_evidence` | `bool` | `False` | Includes the `evidence` column in the returned schema definition. |
|
|
379
|
+
|
|
380
|
+
Return value: a dict containing the schema version, ordered output columns, required columns, and enum values, including the `merge` indicator enum.
|
|
381
|
+
|
|
382
|
+
### `OUTPUT_SCHEMA_VERSION`
|
|
383
|
+
|
|
384
|
+
String constant for the current materialized output schema version.
|
|
385
|
+
|
|
386
|
+
### `__version__`
|
|
387
|
+
|
|
388
|
+
String constant for the package version.
|
|
389
|
+
|
|
390
|
+
## Optional CLI Reference
|
|
391
|
+
|
|
392
|
+
Commands:
|
|
393
|
+
|
|
394
|
+
```bash
|
|
395
|
+
adminlineage run --config path/to/config.yml
|
|
396
|
+
adminlineage preview --config path/to/config.yml
|
|
397
|
+
adminlineage validate --config path/to/config.yml
|
|
398
|
+
adminlineage export --input path/to/evolution_key.csv --format {csv|parquet|jsonl} [--output path]
|
|
399
|
+
```
|
|
400
|
+
|
|
401
|
+
`preview` and `validate` do not call Gemini. `run` writes the full artifact set. `export` converts an existing materialized crosswalk file. If you are using the Python API directly, you can ignore this section.
|
|
402
|
+
|
|
403
|
+
## CLI YAML Config Reference
|
|
404
|
+
|
|
405
|
+
Top-level sections:
|
|
406
|
+
|
|
407
|
+
- `request`
|
|
408
|
+
- `data`
|
|
409
|
+
- `llm`
|
|
410
|
+
- `pipeline`
|
|
411
|
+
- `cache`
|
|
412
|
+
- `retry`
|
|
413
|
+
- `replay`
|
|
414
|
+
- `output`
|
|
415
|
+
|
|
416
|
+
### `request`
|
|
417
|
+
|
|
418
|
+
| Key | Default | Meaning |
|
|
419
|
+
|---|---|---|
|
|
420
|
+
| `country` | required | Country label used in prompts and metadata. |
|
|
421
|
+
| `year_from` | required | Earlier-period label. |
|
|
422
|
+
| `year_to` | required | Later-period label. |
|
|
423
|
+
| `map_col_from` | required | Source name column. |
|
|
424
|
+
| `map_col_to` | `null` | Target name column. Falls back to `map_col_from`. |
|
|
425
|
+
| `exact_match` | `[]` | Columns that must agree before comparison. |
|
|
426
|
+
| `id_col_from` | `null` | Source ID column. |
|
|
427
|
+
| `id_col_to` | `null` | Target ID column. |
|
|
428
|
+
| `extra_context_cols` | `[]` | Extra columns added to the model payload. |
|
|
429
|
+
| `relationship` | `auto` | Relationship mode. |
|
|
430
|
+
| `string_exact_match_prune` | `none` | Exact-string pruning mode. |
|
|
431
|
+
| `evidence` | `false` | Adds the `evidence` column. |
|
|
432
|
+
| `reason` | `false` | Adds the `reason` column. |
|
|
433
|
+
|
|
434
|
+
### `data`
|
|
435
|
+
|
|
436
|
+
| Key | Default | Meaning |
|
|
437
|
+
|---|---|---|
|
|
438
|
+
| `mode` | `files` | One of `files` or `python_hook`. |
|
|
439
|
+
| `from_path` | `null` | Required when `mode: files`. |
|
|
440
|
+
| `to_path` | `null` | Required when `mode: files`. |
|
|
441
|
+
| `callable` | `null` | Required when `mode: python_hook`. Uses `module:function` syntax. |
|
|
442
|
+
| `params` | `{}` | Arbitrary config payload passed to the loader hook. |
|
|
443
|
+
|
|
444
|
+
Loader contract for `python_hook` mode:
|
|
445
|
+
|
|
446
|
+
```python
|
|
447
|
+
def load_data(config: dict) -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
448
|
+
    ...
|
|
449
|
+
```
|
|
450
|
+
|
|
451
|
+
The included example hook is `examples/loaders/sample_loader.py`.
|
|
452
|
+
|
|
453
|
+
For file mode, `data.from_path` and `data.to_path` are resolved relative to the config file location, not your shell location.
|
|
454
|
+
|
|
455
|
+
### `llm`
|
|
456
|
+
|
|
457
|
+
| Key | Default | Meaning |
|
|
458
|
+
|---|---|---|
|
|
459
|
+
| `provider` | `gemini` | Use `gemini` for live runs or `mock` for dry runs and testing. |
|
|
460
|
+
| `model` | `gemini-3.1-flash-lite-preview` | Gemini model name. |
|
|
461
|
+
| `gemini_api_key_env` | `GEMINI_API_KEY` | Environment variable name for the API key. |
|
|
462
|
+
| `temperature` | `0.75` | Gemini temperature. |
|
|
463
|
+
| `seed` | `42` | Deterministic seed. |
|
|
464
|
+
| `enable_google_search` | `true` | Enables grounded adjudication. |
|
|
465
|
+
| `request_timeout_seconds` | `90` | Per-request timeout. |
|
|
466
|
+
|
|
467
|
+
### `pipeline`
|
|
468
|
+
|
|
469
|
+
| Key | Default | Meaning |
|
|
470
|
+
|---|---|---|
|
|
471
|
+
| `batch_size` | `5` | Maximum number of source rows per Gemini request. Failed multi-row requests are retried in smaller batches. |
|
|
472
|
+
| `max_candidates` | `6` | Candidate shortlist size per source row. You can raise this if you want a wider shortlist. |
|
|
473
|
+
| `review_score_threshold` | `0.6` | Rows below this score are flagged for review. |
|
|
474
|
+
|
|
475
|
+
### `cache`
|
|
476
|
+
|
|
477
|
+
| Key | Default | Meaning |
|
|
478
|
+
|---|---|---|
|
|
479
|
+
| `enabled` | `true` | Enables the SQLite LLM cache. |
|
|
480
|
+
| `backend` | `sqlite` | Current cache backend. |
|
|
481
|
+
| `path` | `llm_cache.sqlite` | Cache database path. |
|
|
482
|
+
|
|
483
|
+
### `retry`
|
|
484
|
+
|
|
485
|
+
| Key | Default | Meaning |
|
|
486
|
+
|---|---|---|
|
|
487
|
+
| `max_attempts` | `6` | Maximum retry attempts for transient LLM failures. |
|
|
488
|
+
| `base_delay_seconds` | `1.0` | Initial retry delay. |
|
|
489
|
+
| `max_delay_seconds` | `20.0` | Maximum retry delay. |
|
|
490
|
+
| `jitter_seconds` | `0.2` | Random jitter added to retry timing. |
|
|
491
|
+
|
|
492
|
+
### `replay`
|
|
493
|
+
|
|
494
|
+
| Key | Default | Meaning |
|
|
495
|
+
|---|---|---|
|
|
496
|
+
| `enabled` | `false` | Enables exact replay for fully completed runs. |
|
|
497
|
+
| `store_dir` | `.adminlineage_replay` | Replay bundle directory. |
|
|
498
|
+
|
|
499
|
+
Relative replay store paths are resolved from the config file location. This section only matters if you are using the CLI workflow.
|
|
500
|
+
|
|
501
|
+
### `output`
|
|
502
|
+
|
|
503
|
+
| Key | Default | Meaning |
|
|
504
|
+
|---|---|---|
|
|
505
|
+
| `write_csv` | `true` | Writes `evolution_key.csv`. |
|
|
506
|
+
| `write_parquet` | `true` | Writes `evolution_key.parquet`. |
|
|
507
|
+
|
|
508
|
+
Minimal config shape:
|
|
509
|
+
|
|
510
|
+
```yaml
|
|
511
|
+
request:
|
|
512
|
+
country: India
|
|
513
|
+
year_from: 1951
|
|
514
|
+
year_to: 2001
|
|
515
|
+
map_col_from: subdistrict
|
|
516
|
+
map_col_to: subdistrict
|
|
517
|
+
exact_match: [state, district]
|
|
518
|
+
id_col_from: unit_id
|
|
519
|
+
id_col_to: unit_id
|
|
520
|
+
relationship: auto
|
|
521
|
+
string_exact_match_prune: none
|
|
522
|
+
evidence: false
|
|
523
|
+
reason: false
|
|
524
|
+
|
|
525
|
+
data:
|
|
526
|
+
mode: files
|
|
527
|
+
from_path: ../data/from_units.csv
|
|
528
|
+
to_path: ../data/to_units.csv
|
|
529
|
+
|
|
530
|
+
llm:
|
|
531
|
+
provider: gemini
|
|
532
|
+
model: gemini-3.1-flash-lite-preview
|
|
533
|
+
gemini_api_key_env: GEMINI_API_KEY
|
|
534
|
+
temperature: 0.75
|
|
535
|
+
seed: 42
|
|
536
|
+
enable_google_search: true
|
|
537
|
+
request_timeout_seconds: 90
|
|
538
|
+
|
|
539
|
+
pipeline:
|
|
540
|
+
batch_size: 5
|
|
541
|
+
max_candidates: 6
|
|
542
|
+
review_score_threshold: 0.6
|
|
543
|
+
|
|
544
|
+
cache:
|
|
545
|
+
enabled: true
|
|
546
|
+
backend: sqlite
|
|
547
|
+
path: llm_cache.sqlite
|
|
548
|
+
|
|
549
|
+
retry:
|
|
550
|
+
max_attempts: 6
|
|
551
|
+
base_delay_seconds: 1.0
|
|
552
|
+
max_delay_seconds: 20.0
|
|
553
|
+
jitter_seconds: 0.2
|
|
554
|
+
|
|
555
|
+
replay:
|
|
556
|
+
enabled: false
|
|
557
|
+
store_dir: .adminlineage_replay
|
|
558
|
+
|
|
559
|
+
output:
|
|
560
|
+
write_csv: true
|
|
561
|
+
write_parquet: true
|
|
562
|
+
```
|
|
563
|
+
|
|
564
|
+
## Outputs And Utilities
|
|
565
|
+
|
|
566
|
+
### Main Artifacts
|
|
567
|
+
|
|
568
|
+
| Artifact | Meaning |
|
|
569
|
+
|---|---|
|
|
570
|
+
| `evolution_key.csv` | Main crosswalk output. |
|
|
571
|
+
| `evolution_key.parquet` | Parquet version of the crosswalk output. |
|
|
572
|
+
| `review_queue.csv` | Rows that need manual review. |
|
|
573
|
+
| `run_metadata.json` | Run counts, warnings, request details, and artifact paths. |
|
|
574
|
+
| `links_raw.jsonl` | Incremental per-row decision log used for resumability and replay publishing. |
|
|
575
|
+
|
|
576
|
+
### Crosswalk Columns
|
|
577
|
+
|
|
578
|
+
| Column | Meaning |
|
|
579
|
+
|---|---|
|
|
580
|
+
| `from_name`, `to_name` | Raw source and target names. |
|
|
581
|
+
| `from_canonical_name`, `to_canonical_name` | Normalized names used during matching. |
|
|
582
|
+
| `from_id`, `to_id` | User-supplied IDs when ID columns are given; otherwise fallback internal IDs. |
|
|
583
|
+
| `score` | Confidence in the chosen link, in `[0, 1]`. |
|
|
584
|
+
| `link_type` | One of `rename`, `split`, `merge`, `transfer`, `no_match`, `unknown`. |
|
|
585
|
+
| `relationship` | One of `father_to_father`, `father_to_child`, `child_to_father`, `child_to_child`, `unknown`. |
|
|
586
|
+
| `merge` | `both` for matched rows, `only_in_from` for source-only rows, `only_in_to` for target-only rows appended after the source pass. |
|
|
587
|
+
| `evidence` | Short grounded summary. Included only when `evidence=True`. |
|
|
588
|
+
| `reason` | Longer explanation. Present as a column, but empty unless `reason=True`. |
|
|
589
|
+
| exact-match columns | Copied context columns from the request, such as `state` or `district`. |
|
|
590
|
+
| `country`, `year_from`, `year_to` | Request metadata. |
|
|
591
|
+
| `run_id` | Deterministic run identifier. |
|
|
592
|
+
| `from_key`, `to_key` | Internal stable keys used by the pipeline. |
|
|
593
|
+
| `constraints_passed` | Constraint checks recorded for that row. |
|
|
594
|
+
| `review_flags`, `review_reason` | QA flags and their comma-joined summary. |
|
|
595
|
+
|
|
596
|
+
`review_queue.csv` is a filtered subset of the crosswalk for rows that were flagged for manual review. Target-only rows remain in the final evolution key with `merge="only_in_to"`.
|
|
597
|
+
|
|
598
|
+
## Operational Notes
|
|
599
|
+
|
|
600
|
+
- `exact_match` scopes the candidate search. If you set `exact_match=["state", "district"]`, a row only compares against rows from the same `(state, district)` group. This is the main hierarchical matching mechanism in the package.
|
|
601
|
+
- Candidate generation happens before Gemini. `max_candidates` controls how many shortlist entries the model sees for each source row. The default is 6; raise it explicitly if you want a wider shortlist.
|
|
602
|
+
- Exact string handling happens before the model call. `string_exact_match_prune` controls whether rows that already matched exactly are kept in, or pruned from, the later model-based matching.
|
|
603
|
+
- Live Gemini work is grounded with Google Search and returns strict JSON. The pipeline then materializes CSV and Parquet outputs itself.
|
|
604
|
+
- When `string_exact_match_prune` is `from` or `to`, the package can run one bounded second-stage rescue pass on unmatched primary-side rows. That pass does one grounded research call, and only does a second shortlist decision call if the research returned a usable `lineage_hint`.
|
|
605
|
+
- Replay is opt-in. When `replay_enabled=True` (or `replay.enabled: true` in the CLI config), rerunning the same semantic request reuses the prior completed LLM output instead of calling Gemini again.
|
|
606
|
+
- `seed` helps keep request identity deterministic and makes runs easier to reproduce.
|
|
607
|
+
- The cache is configured in the CLI config. When enabled, the package uses a SQLite cache at `cache.path`.
|
|
608
|
+
- Retry behavior is configurable in the CLI config. Transient Gemini failures are retried according to the `retry` section before a row is marked unresolved.
|
|
609
|
+
- `export_crosswalk` and `adminlineage export` convert an existing materialized crosswalk into `csv`, `parquet`, or `jsonl`.
|
|
610
|
+
|
|
611
|
+
## A Few Practical Defaults
|
|
612
|
+
|
|
613
|
+
- `model="gemini-3.1-flash-lite-preview"`
|
|
614
|
+
- `temperature=0.75`
|
|
615
|
+
- `enable_google_search=True`
|
|
616
|
+
- `evidence=False`
|
|
617
|
+
- `reason=False`
|
|
618
|
+
- `relationship="auto"`
|
|
619
|
+
- `string_exact_match_prune="none"`
|
|
620
|
+
|
|
621
|
+
Those are the current defaults. Change them when you need replay, evidence, stricter scoping, or different review thresholds.
|
|
622
|
+
|
|
623
|
+
## Citation
|
|
624
|
+
|
|
625
|
+
If you use AdminLineageAI in published work, please cite:
|
|
626
|
+
|
|
627
|
+
Siddiqui, T. I., and Vetharenian H. Tariq A.
|