adminlineage 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. adminlineage-0.2.0/LICENSE +21 -0
  2. adminlineage-0.2.0/PKG-INFO +627 -0
  3. adminlineage-0.2.0/README.md +590 -0
  4. adminlineage-0.2.0/pyproject.toml +68 -0
  5. adminlineage-0.2.0/setup.cfg +4 -0
  6. adminlineage-0.2.0/src/adminlineage/__init__.py +15 -0
  7. adminlineage-0.2.0/src/adminlineage/__main__.py +6 -0
  8. adminlineage-0.2.0/src/adminlineage/api.py +150 -0
  9. adminlineage-0.2.0/src/adminlineage/candidates.py +131 -0
  10. adminlineage-0.2.0/src/adminlineage/cli.py +169 -0
  11. adminlineage-0.2.0/src/adminlineage/config.py +129 -0
  12. adminlineage-0.2.0/src/adminlineage/export.py +44 -0
  13. adminlineage-0.2.0/src/adminlineage/io.py +154 -0
  14. adminlineage-0.2.0/src/adminlineage/llm/__init__.py +21 -0
  15. adminlineage-0.2.0/src/adminlineage/llm/base.py +50 -0
  16. adminlineage-0.2.0/src/adminlineage/llm/cache.py +108 -0
  17. adminlineage-0.2.0/src/adminlineage/llm/gemini.py +692 -0
  18. adminlineage-0.2.0/src/adminlineage/llm/mock.py +111 -0
  19. adminlineage-0.2.0/src/adminlineage/llm/retry.py +32 -0
  20. adminlineage-0.2.0/src/adminlineage/logging_utils.py +36 -0
  21. adminlineage-0.2.0/src/adminlineage/models.py +249 -0
  22. adminlineage-0.2.0/src/adminlineage/normalize.py +80 -0
  23. adminlineage-0.2.0/src/adminlineage/pipeline.py +1261 -0
  24. adminlineage-0.2.0/src/adminlineage/pipeline_adjudication.py +534 -0
  25. adminlineage-0.2.0/src/adminlineage/pipeline_materialization.py +210 -0
  26. adminlineage-0.2.0/src/adminlineage/pipeline_second_stage.py +716 -0
  27. adminlineage-0.2.0/src/adminlineage/prompts.py +211 -0
  28. adminlineage-0.2.0/src/adminlineage/replay.py +230 -0
  29. adminlineage-0.2.0/src/adminlineage/review.py +107 -0
  30. adminlineage-0.2.0/src/adminlineage/schema.py +142 -0
  31. adminlineage-0.2.0/src/adminlineage/utils.py +104 -0
  32. adminlineage-0.2.0/src/adminlineage/validation.py +215 -0
  33. adminlineage-0.2.0/src/adminlineage.egg-info/PKG-INFO +627 -0
  34. adminlineage-0.2.0/src/adminlineage.egg-info/SOURCES.txt +36 -0
  35. adminlineage-0.2.0/src/adminlineage.egg-info/dependency_links.txt +1 -0
  36. adminlineage-0.2.0/src/adminlineage.egg-info/entry_points.txt +2 -0
  37. adminlineage-0.2.0/src/adminlineage.egg-info/requires.txt +18 -0
  38. adminlineage-0.2.0/src/adminlineage.egg-info/top_level.txt +1 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Taha Ibrahim Siddiqui
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,627 @@
1
+ Metadata-Version: 2.4
2
+ Name: adminlineage
3
+ Version: 0.2.0
4
+ Summary: Build administrative evolution keys across time with exact-match constrained Gemini adjudication
5
+ Author: AdminLineage Contributors
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/TahaIbrahimSiddiqui/AdminLineageAI
8
+ Project-URL: Repository, https://github.com/TahaIbrahimSiddiqui/AdminLineageAI
9
+ Project-URL: Issues, https://github.com/TahaIbrahimSiddiqui/AdminLineageAI/issues
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
17
+ Requires-Python: >=3.10
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: pandas>=2.0
21
+ Requires-Dist: pydantic>=2.7
22
+ Requires-Dist: PyYAML>=6.0
23
+ Requires-Dist: python-dotenv>=1.0
24
+ Requires-Dist: google-genai>=0.7
25
+ Provides-Extra: io
26
+ Requires-Dist: pyarrow>=15.0; extra == "io"
27
+ Provides-Extra: dev
28
+ Requires-Dist: build>=1.2; extra == "dev"
29
+ Requires-Dist: mypy>=1.10; extra == "dev"
30
+ Requires-Dist: pandas-stubs>=2.2; extra == "dev"
31
+ Requires-Dist: pytest>=8.0; extra == "dev"
32
+ Requires-Dist: ruff>=0.5; extra == "dev"
33
+ Requires-Dist: twine>=5.1; extra == "dev"
34
+ Requires-Dist: types-PyYAML>=6.0; extra == "dev"
35
+ Requires-Dist: vulture>=2.16; extra == "dev"
36
+ Dynamic: license-file
37
+
38
+ # AdminLineageAI
39
+
40
+ AdminLineageAI makes crosswalks between administrative locations such as districts (ADM2), subdistricts (ADM3), states (ADM1), and countries (ADM0) across datasets that may come from completely different sources and different periods. It uses AI to compare likely matches, reason over spelling variants, language-specific forms, and administrative splits, merges, and renames, and produce a usable crosswalk plus review artifacts.
41
+
42
+ Matching administrative units by hand is labour-intensive work. Through this package, we hope to reduce the manual work of matching administrative units between datasets while still keeping a clear review trail and reproducibility.
43
+
44
+ The package generates candidate matches between two datasets, asks Gemini to choose among them, and writes a crosswalk plus review artifacts. It outputs a final evolution key plus review files as CSV and Parquet.
45
+
46
+ <p align="center">
47
+ <img alt="This is an experimental utility. Treat these crosswalks as assistive outputs and cross-verify them, especially in important cases." src="https://img.shields.io/static/v1?label=This%20is%20an%20experimental%20utility.&message=Treat%20these%20crosswalks%20as%20assistive%20outputs%20and%20cross-verify%20them%2C%20especially%20in%20important%20cases.&color=red">
48
+ </p>
49
+
50
+ ## Possible use cases
51
+
52
+ Below are a few possible scenarios where this package can be of assistance. We would also love to hear about other user experiences and use cases for this package.
53
+
54
+ - For instance, you have a dataset from a government scheme and need to match it against a standard administrative list such as a census table. The scheme source may write `Paschimi Singhbhum` while another source uses `West Singhbhum`. Plain fuzzy matching will miss cases like this unless you manually standardize prefixes and suffixes first. AI can match these names because it has the context that `paschim` means `west` in Hindi. The same kind of issue shows up across many widely spoken languages.
55
+ - Handling administrative churn. Districts and other units are regularly split, merged, renamed, or grouped differently, and there is often no up-to-date public evolution list for newly created units. The package runs a wide Google search to find a possible predecessor or successor for each administrative unit in the primary dataset.
56
+ - Creating entirely new evolution crosswalks, at a given administrative level, between two time periods for which no crosswalk exists yet.
57
+
58
+ ## Important Features
59
+
60
+ - The default settings of the package are chosen to give the best results at minimal token cost. Please feel free to change them according to your needs.
61
+ - To keep token costs minimal, we run an exact string match and prune the matched candidates on the primary side before the first stage.
62
+ - Hierarchical matching with `exact_match`. If your data are nested, you can match names within exact scopes such as `country`, `state`, or `district`. For example, you can choose to match district names only within a state, or subdistricts only within a district. This works well, but the exact-match column strings need to line up exactly across both datasets.
63
+ - Replay and reproducibility. Academic pipelines often need to be rerun many times. With replay enabled, repeated semantic requests can reuse prior completed LLM work instead of calling the API again. The `seed` parameter helps keep request identity deterministic and makes reruns easier to reproduce.
64
+
65
+ The supported live workflow in AdminLineageAI is:
66
+
67
+ - Compatible with any `gemini-3+` model
68
+ - Google Search grounding enabled
69
+ - strict JSON output from the model
70
+ - user-controlled batching with automatic split fallback on failed multi-row requests
71
+ - an optional bounded second-stage rescue pass for unmatched rows when `string_exact_match_prune`
72
+ is set to `from` or `to`
73
+
74
+ The bounded second stage works like this:
75
+
76
+ - first pass still does the normal grounded shortlist adjudication
77
+ - if `string_exact_match_prune="from"`, the rescue pass revisits rows with `merge="only_in_from"`
78
+ - if `string_exact_match_prune="to"`, it revisits rows with `merge="only_in_to"`
79
+ - it runs one grounded research call to look for a predecessor or successor name
80
+ - if that research comes back as `unknown` with no lineage hint, the row is left alone and the
81
+ rescue pass stops there
82
+ - otherwise it searches the full opposite table, rebuilds a short global shortlist, and runs one
83
+ final strict JSON decision call without additional search grounding
84
+ - the second stage is sequential, one-pass, resumable, and writes `second_stage_results.jsonl`
85
+
86
+ ## How To Use
87
+
88
+ You do not need the CLI to use AdminLineageAI. The simplest path is the Python API.
89
+
90
+ 1. Install the published package.
91
+
92
+ ```bash
93
+ pip install adminlineage
94
+ ```
95
+
96
+ Install the optional parquet dependency if you want parquet output support:
97
+
98
+ ```bash
99
+ pip install "adminlineage[io]"
100
+ ```
101
+
102
+ 2. Set a Gemini API key in `GEMINI_API_KEY`, or use another environment variable name and pass it explicitly.
103
+
104
+ ```bash
105
+ GEMINI_API_KEY=your_api_key_here
106
+ ```
107
+
108
+ The package can load a nearby `.env` file when it looks for the key.
109
+
110
+ 3. Choose the name column on each side, and add optional exact-match columns, IDs, or extra context columns if you have them.
111
+
112
+ 5. Run the matcher.
113
+
114
+ ```python
115
+ import pandas as pd
116
+ import adminlineage
117
+
118
+ df_from = pd.read_csv("from_units.csv")
119
+ df_to = pd.read_csv("to_units.csv")
120
+
121
+ crosswalk_df, metadata = adminlineage.build_evolution_key(
122
+ df_from,
123
+ df_to,
124
+ country="India",
125
+ year_from=1951,
126
+ year_to=2001,
127
+ map_col_from="district",
128
+ map_col_to="district",
129
+ exact_match=["state"],
130
+ id_col_from="unit_id",
131
+ id_col_to="unit_id",
132
+ relationship="auto",
133
+ string_exact_match_prune="from",
134
+ evidence=False,
135
+ reason=False,
136
+ model="gemini-3.1-flash-lite-preview",
137
+ gemini_api_key_env="GEMINI_API_KEY",
138
+ replay_enabled=True,
139
+ seed=42,
140
+ )
141
+
142
+ print(crosswalk_df[["from_name", "to_name", "merge", "score"]].head())
143
+ print(metadata["artifacts"])
144
+ ```
145
+
146
+ 6. Review the outputs. By default, AdminLineageAI writes artifacts under `outputs/<country>_<year_from>_<year_to>_<map_col_from>`. The main ones are `evolution_key.csv`, `review_queue.csv`, and `run_metadata.json`.
147
+
148
+ ## Common Options
149
+
150
+ - `exact_match`: Restricts matching to rows that agree exactly on one or more scope columns such as `country`, `state`, or `district`.
151
+ - `string_exact_match_prune`: Controls how aggressively exact string hits are removed from later AI work. Use this to control token spend.
152
+ - `relationship`: Declares the kind of relationship you expect, or leave it as `auto`.
153
+ - `max_candidates`: Limits how many candidate rows are shown to the model for each source row. The default is 6.
154
+ - `evidence`: Adds a short factual summary column.
155
+ - `reason`: Adds a longer explanation column.
156
+ - `replay_enabled`: Reuses prior completed LLM work when the semantic request matches.
157
+ - `seed`: Keeps request identity deterministic for more reproducible reruns.
158
+ - `output_dir`: Changes where run artifacts are written.
159
+
160
+ ## Matching Flow Example
161
+
162
+ This example follows a nested district-level match inside `India > Uttar Pradesh` from `2011` to `2025`. Here `string_exact_match_prune='to'`, which sets `to` as the primary side and `from` as the secondary side, where all candidates stay global.
163
+
164
+ ```mermaid
165
+ flowchart TD
166
+ A["From table (2011)<br/>India / Uttar Pradesh / Agra<br/>India / Uttar Pradesh / Kanpur Dehat<br/>India / Uttar Pradesh / Faizabad<br/>India / Uttar Pradesh / Allahabad"]
167
+ B["To table (2025)<br/>India / Uttar Pradesh / Agra<br/>India / Uttar Pradesh / Kanpur Rural<br/>India / Uttar Pradesh / Ayodhya<br/>India / Uttar Pradesh / Prayagraj"]
168
+ C["Nested settings<br/>map_col='district'<br/>exact_match=['state']<br/>string_exact_match_prune='to'<br/>this set 'to' as primary side<br/>and 'from' as secondary side<br/>where all candidates stay global"]
169
+ D["Validate inputs and normalize names"]
170
+ E["Exact string match pruning before LLM"]
171
+ F["Agra -> Agra<br/>no LLM used here<br/>just exact string match"]
172
+ H["AI matches remaining rows on primary side<br/>(Kanpur Rural, Ayodhya, Prayagraj)<br/>using grounded Gemini search<br/>"]
173
+ I["AI matches Kanpur Dehat -> Kanpur Rural<br/>because it has context that 'dehat' means 'rural' in Hindi"]
174
+ J{"Do Ayodhya or Prayagraj stay unmatched<br/>after first stage?"}
175
+ L["Do intensive Gemini search of potential predecessor / successor of Ayodhya / Prayagraj<br/>if they were renamed, merged, split, or transferred"]
176
+ M["If Gemini finds a potential predecessor / successor for that district<br/>match it with the global district list from the secondary side"]
177
+ N["Write final evolution key<br/>Agra -> Agra<br/>Kanpur Dehat -> Kanpur Rural<br/>Faizabad -> Ayodhya<br/>Allahabad -> Prayagraj"]
178
+ O["Write artifacts<br/>evolution_key.csv<br/>review_queue.csv<br/>run_metadata.json<br/>replay bundle"]
179
+
180
+ subgraph G["First stage"]
181
+ H
182
+ I
183
+ end
184
+
185
+ subgraph P["Second stage"]
186
+ L
187
+ M
188
+ end
189
+
190
+ A --> C
191
+ B --> C
192
+ C --> D
193
+ D --> E
194
+ E --> F
195
+ E --> H
196
+ H --> I
197
+ I --> J
198
+ J -- "No" --> N
199
+ J -- "Yes" --> L
200
+ L --> M
201
+ A --> N
202
+ B --> N
203
+ M --> N
204
+ N --> O
205
+ ```
206
+
207
+ ## Hand Check Against Scheme Ground Truth
208
+
209
+ This is a quick hand check against a human-made evolution key for a government scheme implemented nationally in India. The scheme side is `2025` districts, mapped back to their predecessor `2011` districts.
210
+
211
+ The comparison is oriented from the scheme side: for each `district_2025` in the hand key, does the evolution key recover the expected `district_2011` predecessor? Names were normalized before comparison. Differences that are only in spelling or transliteration were counted as `aligns`. A row counts as a match only when the evolution key has a non-blank `from_name`.
212
+
213
+ - `aligns` means the evolution key points to the same 2011 district name
214
+ - `disagrees` means the evolution key points to a different 2011 district
215
+ - `no match` means the evolution key does not provide any non-blank `from_name`
216
+
217
+ | Outcome | Count | Share of 612 hand-coded district pairs |
218
+ |---|---:|---:|
219
+ | Aligns with scheme hand mapping | 595 | 97.22% |
220
+ | Disagrees with scheme hand mapping | 11 | 1.80% |
221
+ | Evolution key provides no 2011 match | 6 | 0.98% |
222
+
223
+ Takeaway: most scheme districts map back to the same 2011 predecessor as the hand key, a few disagree, and a small number have no match. Treat this as a sanity check, not a full audit.
224
+
225
+ ## Optional CLI Workflow
226
+
227
+ The CLI is useful when you want a saved YAML config for repeatable runs, but it is optional.
228
+
229
+ ```bash
230
+ adminlineage preview --config examples/config/example.yml
231
+ adminlineage validate --config examples/config/example.yml
232
+ adminlineage run --config examples/config/example.yml
233
+ adminlineage export --input outputs/india_1951_2001_subdistrict/evolution_key.csv --format jsonl
234
+ ```
235
+
236
+ The package includes these example assets:
237
+
238
+ - `examples/config/example.yml`
239
+ - `examples/loaders/sample_loader.py`
240
+ - `examples/adminlineage_gemini_3_1_flash_lite.ipynb`
241
+
242
+ ## Python API
243
+
244
+ Public objects available from `import adminlineage`:
245
+
246
+ - `build_evolution_key`
247
+ - `preview_plan`
248
+ - `validate_inputs`
249
+ - `export_crosswalk`
250
+ - `get_output_schema_definition`
251
+ - `OUTPUT_SCHEMA_VERSION`
252
+ - `__version__`
253
+
254
+ ### `build_evolution_key`
255
+
256
+ Build the evolution key and write run artifacts.
257
+
258
+ Required arguments:
259
+
260
+ | Argument | Type | Meaning |
261
+ |---|---|---|
262
+ | `df_from` | `pd.DataFrame` | Earlier-period table |
263
+ | `df_to` | `pd.DataFrame` | Later-period table |
264
+ | `country` | `str` | Country label used in prompts and metadata |
265
+ | `year_from` | `int \| str` | Earlier-period label |
266
+ | `year_to` | `int \| str` | Later-period label |
267
+ | `map_col_from` | `str` | Source name column |
268
+
269
+ Optional arguments:
270
+
271
+ | Argument | Type | Default | Meaning |
272
+ |---|---|---|---|
273
+ | `map_col_to` | `str \| None` | `None` | Target name column. Falls back to `map_col_from` when omitted. |
274
+ | `exact_match` | `list[str] \| None` | `None` | Columns that must agree before comparison. |
275
+ | `id_col_from` | `str \| None` | `None` | Source ID column. |
276
+ | `id_col_to` | `str \| None` | `None` | Target ID column. |
277
+ | `extra_context_cols` | `list[str] \| None` | `None` | Extra columns added to the model payload. |
278
+ | `relationship` | `str` | `auto` | One of `auto`, `father_to_father`, `father_to_child`, `child_to_father`, `child_to_child`. |
279
+ | `string_exact_match_prune` | `str` | `none` | `none` keeps exact-string hits in later AI work, `from` removes matched source rows from AI work, `to` removes matched source and target rows from later AI work. |
280
+ | `evidence` | `bool` | `False` | Adds a short evidence summary and includes the `evidence` column. |
281
+ | `reason` | `bool` | `False` | Adds a longer explanation in the `reason` column. |
282
+ | `model` | `str` | `gemini-3.1-flash-lite-preview` | Gemini model name. |
283
+ | `gemini_api_key_env` | `str` | `GEMINI_API_KEY` | Environment variable name used for the API key. |
284
+ | `batch_size` | `int` | `5` | Maximum number of source rows per Gemini request. When a multi-row request fails, the pipeline retries in smaller batches. |
285
+ | `max_candidates` | `int` | `6` | Candidate shortlist size per source row. |
286
+ | `output_dir` | `str \| Path` | `outputs` | Base output directory for run artifacts. |
287
+ | `seed` | `int` | `42` | Deterministic seed for repeatable request identity. |
288
+ | `temperature` | `float` | `0.75` | Gemini temperature. |
289
+ | `enable_google_search` | `bool` | `True` | Enables grounded Gemini adjudication. |
290
+ | `request_timeout_seconds` | `int \| None` | `90` | Per-request timeout. |
291
+ | `env_search_dir` | `str \| Path \| None` | `None` | Starting directory used when searching for `.env`. |
292
+ | `replay_enabled` | `bool` | `False` | Reuses prior completed LLM work when the semantic request matches. |
293
+ | `replay_store_dir` | `str \| Path \| None` | `None` | Replay store path. Falls back to `.adminlineage_replay` internally when replay is enabled. |
294
+
295
+ Return value:
296
+
297
+ - `tuple[pd.DataFrame, dict]`
298
+ - first item: the crosswalk DataFrame
299
+ - second item: run metadata with counts, warnings, request details, and artifact paths
300
+
301
+ ### `preview_plan`
302
+
303
+ Preview grouping and candidate-generation behavior without calling Gemini.
304
+
305
+ ```python
306
+ adminlineage.preview_plan(
307
+ df_from,
308
+ df_to,
309
+ *,
310
+ country,
311
+ year_from,
312
+ year_to,
313
+ map_col_from,
314
+ map_col_to=None,
315
+ exact_match=None,
316
+ id_col_from=None,
317
+ id_col_to=None,
318
+ extra_context_cols=None,
319
+ string_exact_match_prune="none",
320
+ max_candidates=6,
321
+ )
322
+ ```
323
+
324
+ Return value: a diagnostics dict describing validity, group sizes, exact-string hits, and candidate budgets.
325
+
326
+ ### `validate_inputs`
327
+
328
+ Validate the two input tables without running the pipeline.
329
+
330
+ ```python
331
+ adminlineage.validate_inputs(
332
+ df_from,
333
+ df_to,
334
+ *,
335
+ country,
336
+ map_col_from,
337
+ map_col_to=None,
338
+ exact_match=None,
339
+ id_col_from=None,
340
+ id_col_to=None,
341
+ )
342
+ ```
343
+
344
+ Return value: a diagnostics dict that reports whether the inputs are valid and what is missing or duplicated.
345
+
346
+ ### `export_crosswalk`
347
+
348
+ Convert a materialized crosswalk file into another format.
349
+
350
+ ```python
351
+ adminlineage.export_crosswalk(
352
+ input_path="outputs/india_1951_2001_subdistrict/evolution_key.csv",
353
+ output_format="jsonl",
354
+ output_path=None,
355
+ )
356
+ ```
357
+
358
+ Return value: the written output path.
359
+
360
+ Supported output formats:
361
+
362
+ - `csv`
363
+ - `parquet`
364
+ - `jsonl`
365
+
366
+ ### `get_output_schema_definition`
367
+
368
+ Return a machine-readable description of the materialized output schema.
369
+
370
+ ```python
371
+ schema = adminlineage.get_output_schema_definition(include_evidence=False)
372
+ ```
373
+
374
+ Arguments:
375
+
376
+ | Argument | Type | Default | Meaning |
377
+ |---|---|---|---|
378
+ | `include_evidence` | `bool` | `False` | Includes the `evidence` column in the returned schema definition. |
379
+
380
+ Return value: a dict containing the schema version, ordered output columns, required columns, and enum values, including the `merge` indicator enum.
381
+
382
+ ### `OUTPUT_SCHEMA_VERSION`
383
+
384
+ String constant for the current materialized output schema version.
385
+
386
+ ### `__version__`
387
+
388
+ String constant for the package version.
389
+
390
+ ## Optional CLI Reference
391
+
392
+ Commands:
393
+
394
+ ```bash
395
+ adminlineage run --config path/to/config.yml
396
+ adminlineage preview --config path/to/config.yml
397
+ adminlineage validate --config path/to/config.yml
398
+ adminlineage export --input path/to/evolution_key.csv --format {csv|parquet|jsonl} [--output path]
399
+ ```
400
+
401
+ `preview` and `validate` do not call Gemini. `run` writes the full artifact set. `export` converts an existing materialized crosswalk file. If you are using the Python API directly, you can ignore this section.
402
+
403
+ ## CLI YAML Config Reference
404
+
405
+ Top-level sections:
406
+
407
+ - `request`
408
+ - `data`
409
+ - `llm`
410
+ - `pipeline`
411
+ - `cache`
412
+ - `retry`
413
+ - `replay`
414
+ - `output`
415
+
416
+ ### `request`
417
+
418
+ | Key | Default | Meaning |
419
+ |---|---|---|
420
+ | `country` | required | Country label used in prompts and metadata. |
421
+ | `year_from` | required | Earlier-period label. |
422
+ | `year_to` | required | Later-period label. |
423
+ | `map_col_from` | required | Source name column. |
424
+ | `map_col_to` | `null` | Target name column. Falls back to `map_col_from`. |
425
+ | `exact_match` | `[]` | Columns that must agree before comparison. |
426
+ | `id_col_from` | `null` | Source ID column. |
427
+ | `id_col_to` | `null` | Target ID column. |
428
+ | `extra_context_cols` | `[]` | Extra columns added to the model payload. |
429
+ | `relationship` | `auto` | Relationship mode. |
430
+ | `string_exact_match_prune` | `none` | Exact-string pruning mode. |
431
+ | `evidence` | `false` | Adds the `evidence` column. |
432
+ | `reason` | `false` | Adds the `reason` column. |
433
+
434
+ ### `data`
435
+
436
+ | Key | Default | Meaning |
437
+ |---|---|---|
438
+ | `mode` | `files` | One of `files` or `python_hook`. |
439
+ | `from_path` | `null` | Required when `mode: files`. |
440
+ | `to_path` | `null` | Required when `mode: files`. |
441
+ | `callable` | `null` | Required when `mode: python_hook`. Uses `module:function` syntax. |
442
+ | `params` | `{}` | Arbitrary config payload passed to the loader hook. |
443
+
444
+ Loader contract for `python_hook` mode:
445
+
446
+ ```python
447
+ def load_data(config: dict) -> tuple[pd.DataFrame, pd.DataFrame]:
448
+ ...
449
+ ```
450
+
451
+ The included example hook is `examples/loaders/sample_loader.py`.
452
+
453
+ For file mode, `data.from_path` and `data.to_path` are resolved relative to the config file location, not your shell location.
454
+
455
+ ### `llm`
456
+
457
+ | Key | Default | Meaning |
458
+ |---|---|---|
459
+ | `provider` | `gemini` | Use `gemini` for live runs or `mock` for dry runs and testing. |
460
+ | `model` | `gemini-3.1-flash-lite-preview` | Gemini model name. |
461
+ | `gemini_api_key_env` | `GEMINI_API_KEY` | Environment variable name for the API key. |
462
+ | `temperature` | `0.75` | Gemini temperature. |
463
+ | `seed` | `42` | Deterministic seed. |
464
+ | `enable_google_search` | `true` | Enables grounded adjudication. |
465
+ | `request_timeout_seconds` | `90` | Per-request timeout. |
466
+
467
+ ### `pipeline`
468
+
469
+ | Key | Default | Meaning |
470
+ |---|---|---|
471
+ | `batch_size` | `5` | Maximum number of source rows per Gemini request. Failed multi-row requests are retried in smaller batches. |
472
+ | `max_candidates` | `6` | Candidate shortlist size per source row. You can raise this if you want a wider shortlist. |
473
+ | `review_score_threshold` | `0.6` | Rows below this score are flagged for review. |
474
+
475
+ ### `cache`
476
+
477
+ | Key | Default | Meaning |
478
+ |---|---|---|
479
+ | `enabled` | `true` | Enables the SQLite LLM cache. |
480
+ | `backend` | `sqlite` | Current cache backend. |
481
+ | `path` | `llm_cache.sqlite` | Cache database path. |
482
+
483
+ ### `retry`
484
+
485
+ | Key | Default | Meaning |
486
+ |---|---|---|
487
+ | `max_attempts` | `6` | Maximum retry attempts for transient LLM failures. |
488
+ | `base_delay_seconds` | `1.0` | Initial retry delay. |
489
+ | `max_delay_seconds` | `20.0` | Maximum retry delay. |
490
+ | `jitter_seconds` | `0.2` | Random jitter added to retry timing. |
491
+
492
+ ### `replay`
493
+
494
+ | Key | Default | Meaning |
495
+ |---|---|---|
496
+ | `enabled` | `false` | Enables exact replay for fully completed runs. |
497
+ | `store_dir` | `.adminlineage_replay` | Replay bundle directory. |
498
+
499
+ Relative replay store paths are resolved from the config file location. This section only matters if you are using the CLI workflow.
500
+
501
+ ### `output`
502
+
503
+ | Key | Default | Meaning |
504
+ |---|---|---|
505
+ | `write_csv` | `true` | Writes `evolution_key.csv`. |
506
+ | `write_parquet` | `true` | Writes `evolution_key.parquet`. |
507
+
508
+ Minimal config shape:
509
+
510
+ ```yaml
511
+ request:
512
+ country: India
513
+ year_from: 1951
514
+ year_to: 2001
515
+ map_col_from: subdistrict
516
+ map_col_to: subdistrict
517
+ exact_match: [state, district]
518
+ id_col_from: unit_id
519
+ id_col_to: unit_id
520
+ relationship: auto
521
+ string_exact_match_prune: none
522
+ evidence: false
523
+ reason: false
524
+
525
+ data:
526
+ mode: files
527
+ from_path: ../data/from_units.csv
528
+ to_path: ../data/to_units.csv
529
+
530
+ llm:
531
+ provider: gemini
532
+ model: gemini-3.1-flash-lite-preview
533
+ gemini_api_key_env: GEMINI_API_KEY
534
+ temperature: 0.75
535
+ seed: 42
536
+ enable_google_search: true
537
+ request_timeout_seconds: 90
538
+
539
+ pipeline:
540
+ batch_size: 5
541
+ max_candidates: 6
542
+ review_score_threshold: 0.6
543
+
544
+ cache:
545
+ enabled: true
546
+ backend: sqlite
547
+ path: llm_cache.sqlite
548
+
549
+ retry:
550
+ max_attempts: 6
551
+ base_delay_seconds: 1.0
552
+ max_delay_seconds: 20.0
553
+ jitter_seconds: 0.2
554
+
555
+ replay:
556
+ enabled: false
557
+ store_dir: .adminlineage_replay
558
+
559
+ output:
560
+ write_csv: true
561
+ write_parquet: true
562
+ ```
563
+
564
+ ## Outputs And Utilities
565
+
566
+ ### Main Artifacts
567
+
568
+ | Artifact | Meaning |
569
+ |---|---|
570
+ | `evolution_key.csv` | Main crosswalk output. |
571
+ | `evolution_key.parquet` | Parquet version of the crosswalk output. |
572
+ | `review_queue.csv` | Rows that need manual review. |
573
+ | `run_metadata.json` | Run counts, warnings, request details, and artifact paths. |
574
+ | `links_raw.jsonl` | Incremental per-row decision log used for resumability and replay publishing. |
575
+
576
+ ### Crosswalk Columns
577
+
578
+ | Column | Meaning |
579
+ |---|---|
580
+ | `from_name`, `to_name` | Raw source and target names. |
581
+ | `from_canonical_name`, `to_canonical_name` | Normalized names used during matching. |
582
+ | `from_id`, `to_id` | User IDs when supplied, otherwise fallback internal IDs. |
583
+ | `score` | Confidence in the chosen link, in `[0, 1]`. |
584
+ | `link_type` | One of `rename`, `split`, `merge`, `transfer`, `no_match`, `unknown`. |
585
+ | `relationship` | One of `father_to_father`, `father_to_child`, `child_to_father`, `child_to_child`, `unknown`. |
586
+ | `merge` | `both` for matched rows, `only_in_from` for source-only rows, `only_in_to` for target-only rows appended after the source pass. |
587
+ | `evidence` | Short grounded summary. Included only when `evidence=True`. |
588
+ | `reason` | Longer explanation. Present as a column, but empty unless `reason=True`. |
589
+ | exact-match columns | Copied context columns from the request, such as `state` or `district`. |
590
+ | `country`, `year_from`, `year_to` | Request metadata. |
591
+ | `run_id` | Deterministic run identifier. |
592
+ | `from_key`, `to_key` | Internal stable keys used by the pipeline. |
593
+ | `constraints_passed` | Constraint checks recorded for that row. |
594
+ | `review_flags`, `review_reason` | QA flags and their comma-joined summary. |
595
+
596
+ `review_queue.csv` is a filtered subset of the crosswalk for rows that were flagged for manual review. Target-only rows remain in the final evolution key with `merge="only_in_to"`.
597
+
598
+ ## Operational Notes
599
+
600
+ - `exact_match` scopes the candidate search. If you set `exact_match=["state", "district"]`, a row only compares against rows from the same `(state, district)` group. This is the main hierarchical matching mechanism in the package.
601
+ - Candidate generation happens before Gemini. `max_candidates` controls how many shortlist entries the model sees for each source row. The default is 6, but you can still raise it explicitly.
602
+ - Exact string handling happens before the model call. `string_exact_match_prune` controls whether already matched rows remain in later AI work.
603
+ - Live Gemini work is grounded with Google Search and returns strict JSON. The pipeline then materializes CSV and Parquet outputs itself.
604
+ - When `string_exact_match_prune` is `from` or `to`, the package can run one bounded second-stage rescue pass on unmatched primary-side rows. That pass does one grounded research call, and only does a second shortlist decision call if the research returned a usable `lineage_hint`.
605
+ - Replay is opt-in. When `replay_enabled=True`, rerunning the same semantic request reuses the prior completed LLM output instead of calling Gemini again.
606
+ - `seed` helps keep request identity deterministic and makes runs easier to reproduce.
607
+ - Cache is configured in CLI config. When enabled, the package uses a SQLite cache at `cache.path`.
608
+ - Retry behavior is configurable in CLI config. Transient Gemini failures are retried according to the `retry` section before a row is marked unresolved.
609
+ - `export_crosswalk` and `adminlineage export` convert an existing materialized crosswalk into `csv`, `parquet`, or `jsonl`.
610
+
611
+ ## A Few Practical Defaults
612
+
613
+ - `model="gemini-3.1-flash-lite-preview"`
614
+ - `temperature=0.75`
615
+ - `enable_google_search=True`
616
+ - `evidence=False`
617
+ - `reason=False`
618
+ - `relationship="auto"`
619
+ - `string_exact_match_prune="none"`
620
+
621
+ Those are the current defaults. Change them when you need replay, evidence, stricter scoping, or different review thresholds.
622
+
623
+ ## Citation
624
+
625
+ If you use AdminLineageAI in published work, please cite:
626
+
627
+ Siddiqui, T. I., and Vetharenian H. Tariq A.