commoner-probe 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. commoner_probe-0.4.0/CHANGELOG.md +118 -0
  2. commoner_probe-0.4.0/LICENSE +21 -0
  3. commoner_probe-0.4.0/MANIFEST.in +13 -0
  4. commoner_probe-0.4.0/PKG-INFO +531 -0
  5. commoner_probe-0.4.0/README.md +493 -0
  6. commoner_probe-0.4.0/commoner_probe/__init__.py +62 -0
  7. commoner_probe-0.4.0/commoner_probe/__main__.py +5 -0
  8. commoner_probe-0.4.0/commoner_probe/answers.py +598 -0
  9. commoner_probe-0.4.0/commoner_probe/atr_linkage.py +275 -0
  10. commoner_probe-0.4.0/commoner_probe/base.py +169 -0
  11. commoner_probe-0.4.0/commoner_probe/cli.py +466 -0
  12. commoner_probe-0.4.0/commoner_probe/committees.py +603 -0
  13. commoner_probe-0.4.0/commoner_probe/corpus.py +312 -0
  14. commoner_probe-0.4.0/commoner_probe/csr/__init__.py +6 -0
  15. commoner_probe-0.4.0/commoner_probe/csr/mca.py +178 -0
  16. commoner_probe-0.4.0/commoner_probe/dmft/__init__.py +3 -0
  17. commoner_probe-0.4.0/commoner_probe/dmft/mines.py +238 -0
  18. commoner_probe-0.4.0/commoner_probe/entities.py +440 -0
  19. commoner_probe-0.4.0/commoner_probe/evidence.py +250 -0
  20. commoner_probe-0.4.0/commoner_probe/example_topics/__init__.py +23 -0
  21. commoner_probe-0.4.0/commoner_probe/example_topics/affirmative_action.json +82 -0
  22. commoner_probe-0.4.0/commoner_probe/example_topics/home_affairs_starred.json +31 -0
  23. commoner_probe-0.4.0/commoner_probe/example_topics/libraries.json +66 -0
  24. commoner_probe-0.4.0/commoner_probe/example_topics/mines_dmft_pmkkky.json +26 -0
  25. commoner_probe-0.4.0/commoner_probe/example_topics/narcotics_substance.json +44 -0
  26. commoner_probe-0.4.0/commoner_probe/http_client.py +206 -0
  27. commoner_probe-0.4.0/commoner_probe/members.py +127 -0
  28. commoner_probe-0.4.0/commoner_probe/neva.py +663 -0
  29. commoner_probe-0.4.0/commoner_probe/records.py +350 -0
  30. commoner_probe-0.4.0/commoner_probe/resolver.py +169 -0
  31. commoner_probe-0.4.0/commoner_probe/runlog.py +189 -0
  32. commoner_probe-0.4.0/commoner_probe/sansad.py +469 -0
  33. commoner_probe-0.4.0/commoner_probe/schemas/__init__.py +60 -0
  34. commoner_probe-0.4.0/commoner_probe/schemas/answers_atr_response.schema.json +22 -0
  35. commoner_probe-0.4.0/commoner_probe/schemas/answers_dfg_recommendation.schema.json +21 -0
  36. commoner_probe-0.4.0/commoner_probe/schemas/answers_qa_response.schema.json +27 -0
  37. commoner_probe-0.4.0/commoner_probe/schemas/atr_linkage.schema.json +19 -0
  38. commoner_probe-0.4.0/commoner_probe/schemas/committee_members.schema.json +43 -0
  39. commoner_probe-0.4.0/commoner_probe/schemas/entities_bureaucratic_posting.schema.json +19 -0
  40. commoner_probe-0.4.0/commoner_probe/schemas/entities_committee_membership.schema.json +18 -0
  41. commoner_probe-0.4.0/commoner_probe/schemas/entities_ministerial_appointment.schema.json +17 -0
  42. commoner_probe-0.4.0/commoner_probe/schemas/entities_mp_membership.schema.json +20 -0
  43. commoner_probe-0.4.0/commoner_probe/schemas/entities_person.schema.json +16 -0
  44. commoner_probe-0.4.0/commoner_probe/schemas/manifest_committee_report.schema.json +43 -0
  45. commoner_probe-0.4.0/commoner_probe/schemas/manifest_mca_csr.schema.json +72 -0
  46. commoner_probe-0.4.0/commoner_probe/schemas/manifest_mines_dmft.schema.json +94 -0
  47. commoner_probe-0.4.0/commoner_probe/schemas/manifest_qa.schema.json +58 -0
  48. commoner_probe-0.4.0/commoner_probe/schemas/runs.schema.json +39 -0
  49. commoner_probe-0.4.0/commoner_probe/schemas/state_assembly_member.schema.json +64 -0
  50. commoner_probe-0.4.0/commoner_probe/schemas/state_assembly_paper_laid.schema.json +24 -0
  51. commoner_probe-0.4.0/commoner_probe/schemas/state_assembly_question.schema.json +84 -0
  52. commoner_probe-0.4.0/commoner_probe/schemas/state_assembly_question_unlisted.schema.json +84 -0
  53. commoner_probe-0.4.0/commoner_probe/stats.py +235 -0
  54. commoner_probe-0.4.0/commoner_probe/textparse.py +79 -0
  55. commoner_probe-0.4.0/commoner_probe/topics.py +44 -0
  56. commoner_probe-0.4.0/commoner_probe/url_safety.py +82 -0
  57. commoner_probe-0.4.0/commoner_probe/validate.py +213 -0
  58. commoner_probe-0.4.0/commoner_probe.egg-info/PKG-INFO +531 -0
  59. commoner_probe-0.4.0/commoner_probe.egg-info/SOURCES.txt +93 -0
  60. commoner_probe-0.4.0/commoner_probe.egg-info/dependency_links.txt +1 -0
  61. commoner_probe-0.4.0/commoner_probe.egg-info/entry_points.txt +2 -0
  62. commoner_probe-0.4.0/commoner_probe.egg-info/requires.txt +18 -0
  63. commoner_probe-0.4.0/commoner_probe.egg-info/top_level.txt +1 -0
  64. commoner_probe-0.4.0/docs/ENDPOINTS.md +132 -0
  65. commoner_probe-0.4.0/docs/INTEGRATION_SMOKE.md +97 -0
  66. commoner_probe-0.4.0/docs/RATIONALE.md +154 -0
  67. commoner_probe-0.4.0/docs/SCHEMAS.md +668 -0
  68. commoner_probe-0.4.0/pyproject.toml +78 -0
  69. commoner_probe-0.4.0/setup.cfg +4 -0
  70. commoner_probe-0.4.0/tests/test_adapters.py +121 -0
  71. commoner_probe-0.4.0/tests/test_answers.py +348 -0
  72. commoner_probe-0.4.0/tests/test_atr_linkage.py +225 -0
  73. commoner_probe-0.4.0/tests/test_check_leaks.py +204 -0
  74. commoner_probe-0.4.0/tests/test_committees.py +348 -0
  75. commoner_probe-0.4.0/tests/test_corpus_loader.py +264 -0
  76. commoner_probe-0.4.0/tests/test_csr_mca.py +165 -0
  77. commoner_probe-0.4.0/tests/test_dmft_mines.py +193 -0
  78. commoner_probe-0.4.0/tests/test_docs_sync.py +71 -0
  79. commoner_probe-0.4.0/tests/test_entities.py +216 -0
  80. commoner_probe-0.4.0/tests/test_evidence_dmft.py +153 -0
  81. commoner_probe-0.4.0/tests/test_init_topic_cli.py +61 -0
  82. commoner_probe-0.4.0/tests/test_qa_structured_parse.py +212 -0
  83. commoner_probe-0.4.0/tests/test_report_type.py +221 -0
  84. commoner_probe-0.4.0/tests/test_resolve_askers.py +116 -0
  85. commoner_probe-0.4.0/tests/test_resolver.py +150 -0
  86. commoner_probe-0.4.0/tests/test_runlog.py +246 -0
  87. commoner_probe-0.4.0/tests/test_schemas.py +580 -0
  88. commoner_probe-0.4.0/tests/test_security_hardening.py +110 -0
  89. commoner_probe-0.4.0/tests/test_smoke_fixture.py +129 -0
  90. commoner_probe-0.4.0/tests/test_state_assembly.py +155 -0
  91. commoner_probe-0.4.0/tests/test_textparse.py +53 -0
  92. commoner_probe-0.4.0/tests/test_topics.py +62 -0
  93. commoner_probe-0.4.0/tests/test_url_encoding.py +52 -0
  94. commoner_probe-0.4.0/tests/test_url_safety.py +85 -0
  95. commoner_probe-0.4.0/tests/test_validate_cli.py +111 -0
@@ -0,0 +1,118 @@
1
+ # Changelog
2
+
3
+ ## 0.4.0 (2026-06-22)
4
+
5
+ ### Added
6
+
7
+ - **`commoner-probe mca-csr`** — download MCA CDM CSR company-spend CSV exports by financial year.
8
+ - **`manifest_mca_csr` schema** and `ManifestMcaCsrRecord` / `Corpus.manifest_mca_csr()` for typed access to MCA CSR manifest records.
9
+ - **`commoner-probe mines-dmft`** — acquire Ministry of Mines / Odisha DMFT public disclosure files with source provenance.
10
+ - **`commoner-probe evidence dmft`** — build side-by-side DMFT evidence bundles from executive disclosure and Sansad oversight records.
11
+ - **`docs/ENDPOINTS.md`** — public source-family endpoint reference.
12
+ - **`narcotics_substance` built-in topic** for NDPS, trafficking, and substance-abuse oversight records.
13
+
14
+ ### Changed
15
+
16
+ - **Relicensed**: AGPL-3.0-or-later → MIT, so `commoner-probe` can be the permissive shared acquisition floor that downstream repos (including the non-AGPL `sansad-semantic-crawler`) depend on without copyleft friction.
17
+ - `commoner_probe.csr.mca` now uses the verified MCA CDM live contract: `GET /csr-data` for the CSRF-bearing form and `POST /cdm/export.php` for CSV export.
18
+ - Public packaging now includes only release-facing docs; local coordination files (`notes/`, `memory/`, `.ai/`, `.beads/`, `.codex/`, `WORKING.md`, `TODO.md`) are ignored and removed from the tracked public tree.
19
+ - `scripts/check_leaks.py` now blocks private coordination paths if they are accidentally staged.
20
+
21
+ ## 0.3.0 (2026-06-06)
22
+
23
+ ### Breaking changes
24
+
25
+ - **Package renamed**: `sansad-crawler` → `commoner-probe`. Update your `pip install` and imports.
26
+ - Python: `from sansad_crawler import ...` → `from commoner_probe import ...`
27
+ - CLI: `sansad-crawl` → `commoner-probe`
28
+ - Subcommands renamed: `crawl` → `sansad`, `crawl-committees` → `committees`, `extract-atr-linkage` → `atr-linkage`
29
+ - **New subcommand added**: `state-assembly` (NeVA state assembly portals)
30
+ - **Schema field renamed**: `crawled_at` → `probed_at` in all output records
31
+ - **Relicensed**: MIT → AGPL-3.0-or-later
32
+
33
+ ### Added
34
+
35
+ - **`commoner-probe state-assembly`** — probe NeVA state assembly portals (`{portal}.neva.gov.in`). Writes `questions.jsonl`, `questions_unlisted.jsonl`, `members.jsonl`, `papers_laid.jsonl`. Tested on Gujarat assembly 15.
36
+ - **HTTP hardening** (`commoner_probe/http_client.py`): SSRF guard, robots.txt checking, per-domain rate limiting (1 req/s), exponential backoff (3 retries), optional `requests_cache` (6h TTL). Install via `pip install commoner-probe[cache]`.
37
+ - **Committee composition** (`CommitteeProbe.probe_composition()`): writes `committee_members.jsonl`.
38
+ - **`filter_fn` hook on `TopicProfile`**: callable injected by analytics layer at runtime.
39
+ - **`classifier_config` in `TopicProfile`**: propagated to `_runs.jsonl` for corpus auditability.
40
+ - **JSON schemas for new outputs**: `committee_members`, `state_assembly_question`, `state_assembly_question_unlisted`, `state_assembly_member`, `state_assembly_paper_laid`.
41
+ - **`commoner-probe init-topic`**: write a bundled example topic profile to disk (built-ins: `libraries`, `home_affairs_starred`, `affirmative_action`).
42
+ - **Single-sourced version**: `__version__` reads from `importlib.metadata` with pyproject fallback.
43
+ - **GitHub Actions**: CI (matrix 3.10–3.12, ruff, pytest) and OIDC PyPI release workflow.
44
+ - **`MANIFEST.in`**, **`CONTRIBUTING.md`**, **`CODE_OF_CONDUCT.md`** (Contributor Covenant v2.1).
45
+
46
+ ### Changed
47
+
48
+ - Base class `BaseCrawler` → `BaseProbe`; `crawl_ls`/`crawl_rs` → `probe_ls`/`probe_rs`; `crawl_composition` → `probe_composition`.
49
+ - User-Agent: `commoner-probe/0.3.0`.
50
+ - HTTP cache env var: `COMMONER_CACHE_DIR` (was `SANSAD_CACHE_DIR`; old name still honoured with deprecation warning).
51
+
52
+ ---
53
+
54
+ ## 0.2.0 (2026-05-21)
55
+
56
+ ### Added
57
+
58
+ - **`docs/SCHEMAS.md`** — complete field-level reference for every output
59
+ stream: all four manifest record shapes (LS Q/A, RS Q/A, LS committee,
60
+ RS committee), `_runs.jsonl`, three `answers.jsonl` kinds,
61
+ `atr_linkage.jsonl`, and five `entities/*.jsonl` files. Includes
62
+ controlled vocabularies and join-key documentation.
63
+
64
+ - **JSON Schemas** — twelve Draft-2020-12 schemas shipped as package data
65
+ under `sansad_crawler/schemas/`. Exposed via
66
+ `sansad_crawler.schemas.load(name)` and `schemas.list_all()`.
67
+
68
+ - **`sansad_crawler/records.py`** — typed dataclasses for every record kind
69
+ (`ManifestQaRecord`, `ManifestCommitteeReportRecord`, `AnswerQaResponse`,
70
+ `AnswerAtrResponse`, `AnswerDfgRecommendation`, `AtrLinkageRecord`,
71
+ `RunRecord`). Each has `from_dict()` that tolerates unknown keys and
72
+ missing optional fields.
73
+
74
+ - **`sansad_crawler/corpus.py`** — `Corpus` class with streaming iterators
75
+ (`manifest_qa`, `manifest_committee_reports`, `answers_qa`, `answers_atr`,
76
+ `answers_dfg`, `atr_linkages`, `runs`, `entities`), join helpers
77
+ (`join_qa`, `join_atr_chain`), and an opt-in `to_dataframe(stream)` that
78
+ requires `pip install sansad-crawler[pandas]`.
79
+
80
+ - **`sansad-crawl stats`** — new CLI subcommand that prints corpus health:
81
+ record counts by house/year/ministry/committee/report_type, answers
82
+ extraction coverage, entity resolution rate, and date ranges. Use
83
+ `--json` for machine-readable output.
84
+
85
+ - **`sansad-crawl validate`** — new CLI subcommand that validates every
86
+ JSONL file in a corpus against its JSON Schema. Requires
87
+ `pip install sansad-crawler[dev]`. Prints line numbers and JSON pointers
88
+ on failure; exits 1 on any error.
89
+
90
+ - **`[dev]` optional-dependency group** — `jsonschema>=4.20` and
91
+ `pytest>=7`. Install with `pip install sansad-crawler[dev]`.
92
+
93
+ - **`[pandas]` optional-dependency group** — `pandas>=2.0`. Install with
94
+ `pip install sansad-crawler[pandas]`.
95
+
96
+ - **`examples/usage.py`** — demonstration script for the `Corpus` API.
97
+
98
+ ### Changed (non-breaking)
99
+
100
+ - `sansad_crawler.__init__` now re-exports `Corpus`, `QaPair`, `AtrChain`,
101
+ all record dataclasses, and the `schemas` module.
102
+ - `run_id` and `crawled_at` in manifest schemas changed from `required` to
103
+ optional (always present in freshly crawled corpora; may be absent in
104
+ synthetic or backfilled data).
105
+
106
+ ### Unchanged
107
+
108
+ Crawler behaviour, extractor logic, and manifest field set are unchanged.
109
+ All corpora produced by v0.1.0 load and validate cleanly under v0.2.0.
110
+
111
+ ---
112
+
113
+ ## 0.1.0 (2026-05-21)
114
+
115
+ Initial release. Lok Sabha + Rajya Sabha Q/A crawler, standing-committee
116
+ report crawler, regex-based Q/A and ATR extractors, ATR linkage extractor,
117
+ entity resolution, four CLI subcommands (`crawl`, `crawl-committees`,
118
+ `extract-answers`, `extract-atr-linkage`).
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Commoner LLP
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,13 @@
1
+ include LICENSE
2
+ include README.md
3
+ include CHANGELOG.md
4
+ include docs/SCHEMAS.md
5
+ include docs/INTEGRATION_SMOKE.md
6
+ include docs/RATIONALE.md
7
+ include docs/ENDPOINTS.md
8
+ prune .beads
9
+ prune .ai
10
+ prune .claude
11
+ prune .codex
12
+ prune notes
13
+ prune memory
@@ -0,0 +1,531 @@
1
+ Metadata-Version: 2.4
2
+ Name: commoner-probe
3
+ Version: 0.4.0
4
+ Summary: Sousveillance infrastructure for state mandatory-disclosure portals — parliamentary questions, committee reports, budget data, and state assembly records.
5
+ Author: Sreeram Ramasubramanian
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/CommonerLLP/commoner-probe
8
+ Project-URL: Source, https://github.com/CommonerLLP/commoner-probe
9
+ Project-URL: Issues, https://github.com/CommonerLLP/commoner-probe/issues
10
+ Project-URL: Documentation, https://github.com/CommonerLLP/commoner-probe/blob/master/README.md
11
+ Project-URL: Changelog, https://github.com/CommonerLLP/commoner-probe/blob/master/CHANGELOG.md
12
+ Keywords: parliament-of-india,lok-sabha,rajya-sabha,parliamentary-questions,standing-committees,state-assembly,neva,public-records,civic-tech,sousveillance,counter-forensics,right-to-information,open-government,data-justice
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Topic :: Sociology
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Operating System :: OS Independent
21
+ Requires-Python: >=3.10
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Provides-Extra: pdf
25
+ Requires-Dist: pdfminer.six>=20231228; extra == "pdf"
26
+ Provides-Extra: http
27
+ Requires-Dist: requests>=2.31.0; extra == "http"
28
+ Provides-Extra: pandas
29
+ Requires-Dist: pandas>=2.0; extra == "pandas"
30
+ Provides-Extra: dev
31
+ Requires-Dist: jsonschema>=4.20; extra == "dev"
32
+ Requires-Dist: pytest>=7; extra == "dev"
33
+ Requires-Dist: ruff>=0.4; extra == "dev"
34
+ Provides-Extra: all
35
+ Requires-Dist: pdfminer.six>=20231228; extra == "all"
36
+ Requires-Dist: requests>=2.31.0; extra == "all"
37
+ Dynamic: license-file
38
+
39
+ # commoner-probe
40
+
41
+ Sousveillance infrastructure for the state's mandatory disclosure systems.
42
+
43
+ A commoner probes the state's own paperwork — parliamentary questions, committee
44
+ reports, state assembly records — and turns it into evidence. `commoner-probe`
45
+ automates the acquisition so you can focus on the analysis.
46
+
47
+ ```bash
48
+ pip install "commoner-probe[all]"
49
+ # then, in Python: import commoner_probe as probe  (alias used throughout CommonerLLP toolchain)
50
+ ```
51
+
52
+ ---
53
+
54
+ ## Why this exists
55
+
56
+ Parliamentary questions, committee reports, state assembly records, CSR
57
+ exports, and public mining-district disclosures are mandatory or official
58
+ public disclosures. The data exists. The problem
59
+ is that it lives across undocumented portals with inconsistent APIs, no bulk
60
+ export, and PDFs that require extraction to read programmatically.
61
+
62
+ `commoner-probe` handles the entire acquisition pipeline:
63
+
64
+ ```
65
+ public disclosure portals → manifest.jsonl → files/PDFs → extracted records → your analysis
66
+ (metadata) (raw source) (structured text)
67
+ ```
68
+
69
+ Classification, topic modelling, and dossier generation are intentionally out
70
+ of scope. This library does one thing: acquire public disclosure data into
71
+ provenance-rich, schema-validated JSONL and source files.
72
+
73
+ ---
74
+
75
+ ## Install
76
+
77
+ ```bash
78
+ pip install "commoner-probe[all]" # requests + PDF extraction
79
+ pip install "commoner-probe[all,dev]" # + schema validation and tests
80
+ ```
81
+
82
+ ---
83
+
84
+ ## Five-minute quickstart
85
+
86
+ ### Step 1 — Write a topic profile
87
+
88
+ ```json
89
+ {
90
+ "name": "climate",
91
+ "description": "Climate change and environmental policy",
92
+ "search_groups": {
93
+ "climate": ["climate change", "global warming", "net zero"],
94
+ "air_quality": ["air pollution", "AQI", "particulate matter"]
95
+ },
96
+ "lok_sabha_ministries": ["ENVIRONMENT", "POWER", "PETROLEUM"],
97
+ "rajya_sabha_ministry_likes": ["ENVIRONMENT", "POWER", "PETROLEUM"]
98
+ }
99
+ ```
100
+
101
+ ### Step 2 — Probe parliamentary questions
102
+
103
+ ```bash
104
+ commoner-probe sansad \
105
+ --topic topic.json \
106
+ --out data/climate \
107
+ --house both \
108
+ --from-date 2019-01-01
109
+ ```
110
+
111
+ Writes `data/climate/manifest.jsonl` — one record per question from both houses.
112
+
113
+ ### Step 3 — Probe committee reports
114
+
115
+ ```bash
116
+ commoner-probe committees \
117
+ --topic topic.json \
118
+ --out data/climate-committees \
119
+ --house both
120
+ ```
121
+
122
+ Writes `data/climate-committees/manifest.jsonl` — one record per standing committee report (LS and RS DRSCs).
123
+
124
+ ### Step 4 — Extract text from PDFs
125
+
126
+ ```bash
127
+ commoner-probe extract-answers --out data/climate
128
+ commoner-probe extract-answers --out data/climate-committees
129
+ ```
130
+
131
+ Parses downloaded PDFs into `answers.jsonl`: Q/A pairs, committee
132
+ recommendations, and government responses.
133
+
134
+ ### Step 5 — Load in Python
135
+
136
+ ```python
137
+ import commoner_probe as probe
138
+
139
+ c = probe.Corpus("data/climate")
140
+
141
+ for r in c.manifest_qa():
142
+ print(r.date, r.house, r.ministry, r.title)
143
+
144
+ for pair in c.join_qa():
145
+ if pair.answers:
146
+ print(pair.manifest.title)
147
+ print(pair.answers[0].question_text[:200])
148
+ ```
149
+
150
+ ---
151
+
152
+ ## What you can study
153
+
154
+ ### Parliamentary questions (Lok Sabha + Rajya Sabha)
155
+
156
+ Each record carries who asked (MP name, party, state), which ministry answered,
157
+ question number, type (starred / unstarred), date, session, and the full PDF.
158
+ After running `extract-answers`, the extracted question and answer text is available as well.
159
+
160
+ **Typical research questions**: ministry responsiveness rates, which MPs ask
161
+ the most questions by topic, how the same policy question evolves across
162
+ sessions, party-level questioning patterns.
163
+
164
+ ```python
165
+ import commoner_probe as probe
166
+ from collections import Counter
167
+
168
+ c = probe.Corpus("data/climate")
169
+ ministry_counts = Counter(r.ministry for r in c.manifest_qa())
170
+ for ministry, n in ministry_counts.most_common(10):
171
+ print(f"{ministry}: {n}")
172
+ ```
173
+
174
+ ### Standing committee reports (LS + RS DRSCs)
175
+
176
+ Committee reports come in four shapes:
177
+
178
+ | `report_type` | What it is |
179
+ |---|---|
180
+ | `demands_for_grants` | Annual budget scrutiny — the committee dissects ministry spending |
181
+ | `bill` | The committee's examination of a pending bill before it passes |
182
+ | `subject` | Own-initiative policy investigation — deepest substantive record |
183
+ | `action_taken` | The government's formal response to the committee's recommendations |
184
+
185
+ Action Taken Reports (ATRs) are the government's formal written responses to
186
+ committee recommendations. The `atr-linkage` command connects each ATR back
187
+ to the original report, enabling lifecycle analysis:
188
+ *recommendation → government rejection/acceptance → follow-up*.
189
+
190
+ ```python
191
+ import commoner_probe as probe
192
+
193
+ c = probe.Corpus("data/climate-committees")
194
+
195
+ for chain in c.join_atr_chain():
196
+ print(f"Report: {chain.original and chain.original.title}")
197
+ print(f" Recommendations: {len(chain.original_observations)}")
198
+ print(f" Government responses: {len(chain.atr_answers)}")
199
+ ```
200
+
201
+ ### State assembly records (NeVA portals)
202
+
203
+ From 2020, sub-national governments have been adopting NIC's NeVA (National
204
+ e-Vidhan Application) infrastructure under a centrally sponsored scheme run
205
+ by the Ministry of Parliamentary Affairs. Most state assemblies are onboarding,
206
+ though coverage varies. The `state-assembly` command probes any NeVA portal:
207
+
208
+ ```bash
209
+ commoner-probe state-assembly \
210
+ --portal gujarat \
211
+ --state GJ \
212
+ --out data/gujarat-assembly \
213
+ --assemblies 15
214
+ ```
215
+
216
+ ### MCA CSR company-spend exports
217
+
218
+ The Ministry of Corporate Affairs CDM CSR data page exposes downloadable CSV
219
+ exports by financial year. These records compare reporting/spending companies
220
+ and project-sector amounts. They do not identify CSR consultants or implementing
221
+ agencies unless MCA publishes that in the source export.
222
+
223
+ ```bash
224
+ commoner-probe mca-csr \
225
+ --out data/mca-csr \
226
+ --years 2022-23,2021-22
227
+ ```
228
+
229
+ ```python
230
+ import commoner_probe as probe
231
+
232
+ c = probe.Corpus("data/mca-csr")
233
+ for r in c.manifest_mca_csr():
234
+ print(r.financial_year, r.status, r.filename)
235
+ ```
236
+
237
+ ### Mines DMFT / PMKKKY disclosures
238
+
239
+ `mines-dmft` acquires raw Ministry of Mines and Odisha DMFT public disclosure
240
+ files. Ministry CSVs are current cumulative snapshots timestamped by the
241
+ source; treat them as snapshots, not fiscal-year series.
242
+
243
+ ```bash
244
+ commoner-probe mines-dmft \
245
+ --out data/mines-dmft \
246
+ --sources mines-gov-in,odisha
247
+ ```
248
+
249
+ Pair the executive disclosure snapshots with Sansad oversight records without
250
+ flattening the source families:
251
+
252
+ ```bash
253
+ commoner-probe evidence dmft \
254
+ --mines-dmft-dir data/mines-dmft \
255
+ --sansad-dir data/sansad/mines-dmft-pmkkky \
256
+ --out data/evidence/dmft.json
257
+ ```
258
+
259
+ ---
260
+
261
+ ## All commands
262
+
263
+ ### `commoner-probe sansad` — parliamentary questions
264
+
265
+ ```bash
266
+ commoner-probe sansad \
267
+ --topic topic.json \
268
+ --out data/climate \
269
+ --house both \
270
+ --from-date 2019-01-01 \
271
+ --to-date 2026-01-01
272
+ ```
273
+
274
+ | Flag | Default | What it does |
275
+ |---|---|---|
276
+ | `--topic` | required | Path to topic profile JSON |
277
+ | `--out` | required | Output corpus directory |
278
+ | `--house` | `both` | `ls`, `rs`, or `both` |
279
+ | `--from-date` | — | Earliest question date (YYYY-MM-DD) |
280
+ | `--to-date` | — | Latest question date |
281
+ | `--qtype` | `both` | `starred`, `unstarred`, or `both` |
282
+ | `--sessions` | `1-267` | Rajya Sabha session range |
283
+ | `--no-download` | off | Skip PDF downloads; metadata only |
284
+ | `--with-entities` | off | Resolve asker names to stable entity IDs |
285
+ | `--max-records N` | — | Stop after N new records per house (smoke-test) |
286
+ | `--max-buckets N` | — | Only run the first N search/ministry combos |
287
+ | `--reset` | off | Wipe existing manifest and start fresh |
288
+
289
+ ### `commoner-probe committees` — standing committee reports
290
+
291
+ ```bash
292
+ commoner-probe committees \
293
+ --topic topic.json \
294
+ --out data/committees \
295
+ --house both \
296
+ --committees finance,education
297
+ ```
298
+
299
+ | Flag | Default | What it does |
300
+ |---|---|---|
301
+ | `--committees` | all | Comma-separated committee slugs |
302
+ | `--lok-sabha-no` | `18` | LS number for LS reports |
303
+ | `--from-date` / `--to-date` | — | Date range filter |
304
+ | `--no-download` | off | Skip PDF downloads |
305
+
306
+ **Available LS committees** (16 DRSCs):
307
+ `agriculture`, `chemicals`, `coal`, `communications`, `consumer_affairs`,
308
+ `defence`, `energy`, `external_affairs`, `finance`, `housing`, `labour`,
309
+ `petroleum`, `railways`, `rural_development`, `social_justice`, `water_resources`
310
+
311
+ **Available RS committees** (8 DRSCs):
312
+ `commerce`, `education`, `health`, `home_affairs`, `industry`, `personnel`,
313
+ `science`, `transport`
314
+
315
+ ### `commoner-probe extract-answers` — PDF text extraction
316
+
317
+ ```bash
318
+ commoner-probe extract-answers --out data/climate
319
+ commoner-probe extract-answers --out data/climate --refresh
320
+ ```
321
+
322
+ Reads `manifest.jsonl` and downloaded PDFs; writes `answers.jsonl` with:
323
+
324
+ - `qa_response` — (question_text, answer_text) pairs from Q/A PDFs
325
+ - `atr_response` — (recommendation_no, recommendation_text, response_text) triples from ATR PDFs
326
+ - `dfg_recommendation` — numbered observation paragraphs from DFG/Bill/Subject PDFs
327
+
328
+ Requires `pip install "commoner-probe[pdf]"`.
329
+
330
+ ### `commoner-probe atr-linkage` — ATR → original report
331
+
332
+ ```bash
333
+ commoner-probe atr-linkage --out data/committees
334
+ ```
335
+
336
+ Writes `atr_linkage.jsonl` — each ATR linked back to the report it responds to.
337
+ Safe to re-run (idempotent overwrite).
338
+
339
+ ### `commoner-probe state-assembly` — state legislature records
340
+
341
+ ```bash
342
+ commoner-probe state-assembly \
343
+ --portal gujarat \
344
+ --state GJ \
345
+ --out data/gujarat \
346
+ --assemblies 15
347
+ ```
348
+
349
+ ### `commoner-probe mca-csr` — MCA CSR company-spend exports
350
+
351
+ ```bash
352
+ commoner-probe mca-csr \
353
+ --out data/mca-csr \
354
+ --years 2022-23
355
+ ```
356
+
357
+ Downloads CSV exports from the MCA CDM CSR data page and writes one
358
+ `manifest.jsonl` record per financial year. Use `--dry-run` to print manifest
359
+ records without opening a network session.
360
+
361
+ ### `commoner-probe mines-dmft` — Ministry of Mines / DMFT files
362
+
363
+ ```bash
364
+ commoner-probe mines-dmft \
365
+ --out data/mines-dmft \
366
+ --sources mines-gov-in,odisha
367
+ ```
368
+
369
+ Downloads raw Ministry of Mines static CSV snapshots and Odisha DMFT public
370
+ JSON/report surfaces. Use `--dry-run` to print manifest records without opening
371
+ network sessions.
372
+
373
+ ### `commoner-probe evidence dmft` — cross-source evidence bundle
374
+
375
+ ```bash
376
+ commoner-probe evidence dmft \
377
+ --mines-dmft-dir data/mines-dmft \
378
+ --sansad-dir data/sansad/mines-dmft-pmkkky \
379
+ --out data/evidence/dmft.json
380
+ ```
381
+
382
+ Builds a JSON bundle with separate `executive_disclosure` and
383
+ `parliamentary_oversight` sections. It does not merge unlike source families
384
+ into one table.
385
+
386
+ ### `commoner-probe stats` — corpus health
387
+
388
+ ```bash
389
+ commoner-probe stats --out data/climate
390
+ commoner-probe stats --out data/climate --json
391
+ ```
392
+
393
+ ### `commoner-probe validate` — schema validation
394
+
395
+ ```bash
396
+ commoner-probe validate --out data/climate
397
+ ```
398
+
399
+ Validates every JSONL file against its JSON Schema. Exits 1 on errors.
400
+ Requires the `[dev]` extra (`pip install "commoner-probe[dev]"`).
401
+
402
+ ---
403
+
404
+ ## Topic profile
405
+
406
+ Controls what the probe acquires:
407
+
408
+ ```json
409
+ {
410
+ "name": "libraries",
411
+ "description": "Public library infrastructure and policy",
412
+ "search_groups": {
413
+ "public_libraries": ["public library", "rural library"],
414
+ "policy": ["National Mission on Libraries", "RRRLF"]
415
+ },
416
+ "lok_sabha_ministries": ["CULTURE", "EDUCATION"],
417
+ "rajya_sabha_ministry_likes": ["CULTURE", "EDUCATION"]
418
+ }
419
+ ```
420
+
421
+ - `search_groups` — keyword groups for LS full-text search. Each query runs
422
+ independently; results are union-deduped on `key`.
423
+ - `lok_sabha_ministries` — exact ministry filter for LS (case-sensitive).
424
+ - `rajya_sabha_ministry_likes` — ministry LIKE filter for RS (prefix match).
425
+
426
+ See `examples/topics/` for working examples.
427
+
428
+ ---
429
+
430
+ ## Output files
431
+
432
+ | File | Contents |
433
+ |------|----------|
434
+ | `manifest.jsonl` | One record per question or committee report |
435
+ | `_runs.jsonl` | Audit log: scope, topic hash, errors, per-bucket counts |
436
+ | `answers.jsonl` | Extracted Q/A and recommendation/response pairs |
437
+ | `atr_linkage.jsonl` | ATR → original report linkages |
438
+ | source CSV/JSON/HTML files | Raw source files for source-specific probes such as MCA CSR and DMFT |
439
+ | `pdfs/ls/` | Downloaded LS PDFs |
440
+ | `pdfs/rs/` | Downloaded RS PDFs |
441
+ | `probe.log` | Human-readable probe progress log |
442
+
443
+ For complete field-level documentation see [`docs/SCHEMAS.md`](docs/SCHEMAS.md).
444
+
445
+ ---
446
+
447
+ ## Entity resolution (`--with-entities`)
448
+
449
+ Pass `--with-entities` to `commoner-probe sansad` to resolve asker names to
450
+ stable `entity_id` values. On first run the entity store is populated from
451
+ the sansad.in MP roster; subsequent runs reuse the local cache.
452
+
453
+ Resolved entity IDs join across corpora and sessions — useful for studying
454
+ the same MP's questioning behaviour over time or across houses.
455
+
456
+ ---
457
+
458
+ ## Python API
459
+
460
+ ```python
461
+ import commoner_probe as probe
462
+
463
+ c = probe.Corpus("data/climate")
464
+
465
+ # Typed iterators
466
+ for r in c.manifest_qa(): # ManifestQaRecord
467
+ ...
468
+ for r in c.manifest_committee_reports(): # ManifestCommitteeReportRecord
469
+ ...
470
+ for r in c.answers_qa(): # AnswerQaResponse
471
+ ...
472
+ for r in c.answers_atr(): # AnswerAtrResponse
473
+ ...
474
+ for r in c.answers_dfg(): # AnswerDfgRecommendation
475
+ ...
476
+ for r in c.atr_linkages(): # AtrLinkageRecord
477
+ ...
478
+ for r in c.manifest_mca_csr(): # ManifestMcaCsrRecord
479
+ ...
480
+ for r in c.manifest_mines_dmft(): # ManifestMinesDmftRecord
481
+ ...
482
+ for r in c.runs(): # RunRecord
483
+ ...
484
+
485
+ # Join helpers
486
+ for pair in c.join_qa(): # manifest + extracted answers
487
+ ...
488
+ for chain in c.join_atr_chain(): # ATR + original report + observations
489
+ ...
490
+
491
+ # pandas (pip install "commoner-probe[pandas]")
492
+ df = c.to_dataframe("manifest_committee_reports")
493
+ ```
494
+
495
+ See [`examples/usage.py`](examples/usage.py) for a runnable walkthrough.
496
+ See [`docs/ENDPOINTS.md`](docs/ENDPOINTS.md) for source-family endpoint notes.
497
+
498
+ ---
499
+
500
+ ## License
501
+
502
+ MIT License — see [`LICENSE`](LICENSE).
503
+
504
+ `commoner-probe` is sousveillance infrastructure, built for the commons. It is
505
+ released under the permissive MIT license so it can serve as a shared
506
+ acquisition floor that any downstream project — including the other repos in the
507
+ CommonerLLP federation, whatever their own licenses — can build on without
508
+ copyleft friction.
509
+
510
+ ---
511
+
512
+ ## Upcoming
513
+
514
+ ### Floor debates
515
+
516
+ sansad.in exposes full debate proceedings via `api_ls/debate/text-of-debate`
517
+ (structured JSON, 17th Lok Sabha onwards). Each record covers a single day:
518
+ type of business, member who spoke, and verbatim text. The richest longitudinal
519
+ record of what MPs say on the floor.
520
+
521
+ ### Bills and legislation
522
+
523
+ `sansad.in/ls/legislation/bills` lists every bill since independence with
524
+ introduction date, debate dates, and status at each stage. Enables tracking
525
+ legislative velocity, committee scrutiny rates, and private member bill outcomes.
526
+
527
+ ### MP profiles and career timelines
528
+
529
+ Structured biographical data for each member: constituency, state, party, terms
530
+ served, educational background, declared profession. Pairs with the Q/A corpus
531
+ for studies of how MP background predicts parliamentary participation.