commoner-probe 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. commoner_probe/__init__.py +62 -0
  2. commoner_probe/__main__.py +5 -0
  3. commoner_probe/answers.py +598 -0
  4. commoner_probe/atr_linkage.py +275 -0
  5. commoner_probe/base.py +169 -0
  6. commoner_probe/cli.py +466 -0
  7. commoner_probe/committees.py +603 -0
  8. commoner_probe/corpus.py +312 -0
  9. commoner_probe/csr/__init__.py +6 -0
  10. commoner_probe/csr/mca.py +178 -0
  11. commoner_probe/dmft/__init__.py +3 -0
  12. commoner_probe/dmft/mines.py +238 -0
  13. commoner_probe/entities.py +440 -0
  14. commoner_probe/evidence.py +250 -0
  15. commoner_probe/example_topics/__init__.py +23 -0
  16. commoner_probe/example_topics/affirmative_action.json +82 -0
  17. commoner_probe/example_topics/home_affairs_starred.json +31 -0
  18. commoner_probe/example_topics/libraries.json +66 -0
  19. commoner_probe/example_topics/mines_dmft_pmkkky.json +26 -0
  20. commoner_probe/example_topics/narcotics_substance.json +44 -0
  21. commoner_probe/http_client.py +206 -0
  22. commoner_probe/members.py +127 -0
  23. commoner_probe/neva.py +663 -0
  24. commoner_probe/records.py +350 -0
  25. commoner_probe/resolver.py +169 -0
  26. commoner_probe/runlog.py +189 -0
  27. commoner_probe/sansad.py +469 -0
  28. commoner_probe/schemas/__init__.py +60 -0
  29. commoner_probe/schemas/answers_atr_response.schema.json +22 -0
  30. commoner_probe/schemas/answers_dfg_recommendation.schema.json +21 -0
  31. commoner_probe/schemas/answers_qa_response.schema.json +27 -0
  32. commoner_probe/schemas/atr_linkage.schema.json +19 -0
  33. commoner_probe/schemas/committee_members.schema.json +43 -0
  34. commoner_probe/schemas/entities_bureaucratic_posting.schema.json +19 -0
  35. commoner_probe/schemas/entities_committee_membership.schema.json +18 -0
  36. commoner_probe/schemas/entities_ministerial_appointment.schema.json +17 -0
  37. commoner_probe/schemas/entities_mp_membership.schema.json +20 -0
  38. commoner_probe/schemas/entities_person.schema.json +16 -0
  39. commoner_probe/schemas/manifest_committee_report.schema.json +43 -0
  40. commoner_probe/schemas/manifest_mca_csr.schema.json +72 -0
  41. commoner_probe/schemas/manifest_mines_dmft.schema.json +94 -0
  42. commoner_probe/schemas/manifest_qa.schema.json +58 -0
  43. commoner_probe/schemas/runs.schema.json +39 -0
  44. commoner_probe/schemas/state_assembly_member.schema.json +64 -0
  45. commoner_probe/schemas/state_assembly_paper_laid.schema.json +24 -0
  46. commoner_probe/schemas/state_assembly_question.schema.json +84 -0
  47. commoner_probe/schemas/state_assembly_question_unlisted.schema.json +84 -0
  48. commoner_probe/stats.py +235 -0
  49. commoner_probe/textparse.py +79 -0
  50. commoner_probe/topics.py +44 -0
  51. commoner_probe/url_safety.py +82 -0
  52. commoner_probe/validate.py +213 -0
  53. commoner_probe-0.4.0.dist-info/METADATA +531 -0
  54. commoner_probe-0.4.0.dist-info/RECORD +58 -0
  55. commoner_probe-0.4.0.dist-info/WHEEL +5 -0
  56. commoner_probe-0.4.0.dist-info/entry_points.txt +2 -0
  57. commoner_probe-0.4.0.dist-info/licenses/LICENSE +21 -0
  58. commoner_probe-0.4.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,62 @@
1
+ # SPDX-License-Identifier: MIT
2
+ """Data-pulling crawler for Indian Parliament question corpora."""
3
+
4
+ import re
5
+ from pathlib import Path
6
+
7
+ from . import schemas as schemas
8
+ from .corpus import AtrChain, Corpus, QaPair # noqa: F401
9
+ from .records import ( # noqa: F401
10
+ AnswerAtrResponse,
11
+ AnswerDfgRecommendation,
12
+ AnswerQaResponse,
13
+ AtrLinkageRecord,
14
+ BureaucraticPosting,
15
+ CommitteeMembership,
16
+ ManifestCommitteeReportRecord,
17
+ ManifestMinesDmftRecord,
18
+ ManifestQaRecord,
19
+ MinisterialAppointment,
20
+ MpMembership,
21
+ Person,
22
+ RunRecord,
23
+ )
24
+
25
+ __all__ = [
26
+ "__version__",
27
+ "schemas",
28
+ "Corpus",
29
+ "QaPair",
30
+ "AtrChain",
31
+ "ManifestQaRecord",
32
+ "ManifestCommitteeReportRecord",
33
+ "ManifestMinesDmftRecord",
34
+ "AnswerQaResponse",
35
+ "AnswerAtrResponse",
36
+ "AnswerDfgRecommendation",
37
+ "AtrLinkageRecord",
38
+ "RunRecord",
39
+ "Person",
40
+ "MpMembership",
41
+ "CommitteeMembership",
42
+ "MinisterialAppointment",
43
+ "BureaucraticPosting",
44
+ ]
45
def _version_from_pyproject() -> str | None:
    """Read the version from the source tree's ``pyproject.toml``.

    Fallback for source-tree runs before the package is installed.

    Returns:
        The first ``version = "..."`` value that sits on a line of its own
        in the ``pyproject.toml`` located in the parent of this file's
        directory, or ``None`` when the file is missing, unreadable, or
        contains no such line.
    """
    pyproject = Path(__file__).resolve().parent.parent / "pyproject.toml"
    try:
        content = pyproject.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # This runs at import time as a last resort; a missing or
        # unreadable file must not make the package import fail.
        return None
    match = re.search(r'^version[ \t]*=[ \t]*"([^"]+)"[ \t]*$', content, re.MULTILINE)
    return match.group(1) if match else None
52
+
53
+
54
def _resolve_version() -> str:
    """Return the package version string.

    Prefers the installed distribution metadata for ``commoner-probe``.
    If that lookup raises for any reason (e.g. the distribution is not
    installed), falls back to ``_version_from_pyproject()`` and finally
    to the placeholder ``"0.0.0"``.
    """
    try:
        from importlib.metadata import version as _dist_version

        return _dist_version("commoner-probe")
    except Exception:  # noqa: BLE001 - deliberate best-effort at import time
        return _version_from_pyproject() or "0.0.0"


__version__ = _resolve_version()
@@ -0,0 +1,5 @@
1
+ # SPDX-License-Identifier: MIT
2
+ from .cli import main
3
+
4
if __name__ == "__main__":
    # Run the CLI entry point when this module is executed as a script.
    main()
@@ -0,0 +1,598 @@
1
+ # SPDX-License-Identifier: MIT
2
+ """Phase 1: structured text extraction from parliamentary PDFs.
3
+
4
+ Three shapes of source PDF, three extractors:
5
+
6
+ * **Q/A PDFs** (parliamentary questions): split a single
7
+ question+answer into ``(question_text, answer_text)`` on the
8
+ "Reply by ..." / "Answer" boundary. Output: one record per source.
9
+
10
+ * **ATR PDFs** (Action-Taken Reports — government's response to a
11
+ prior committee report): split into
12
+ ``[(recommendation_no, recommendation_text, response_text), ...]``
13
+ on "Recommendation No. X" / "Reply of the Government" boundaries.
14
+ One source PDF → many records.
15
+
16
+ * **DFG / original committee reports**: find the
17
+ ``OBSERVATIONS/RECOMMENDATIONS OF THE COMMITTEE`` section, split on
18
+ numbered paragraphs. Output: ``[(recommendation_no,
19
+ recommendation_text), ...]`` — no response text because the
20
+ executive hasn't replied yet (that arrives in a future ATR).
21
+
22
+ This module is **extraction only**. Classification (counterinsurgency
23
+ labels) is Phase 2 in ``discourse.py``.
24
+
25
+ Schema commitments for ``answers.jsonl`` are documented in the file's
26
+ header comments and in `notes/PLAN_v0.5.0_SCOPE.md`.
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import json
32
+ import re
33
+ from dataclasses import dataclass, field
34
+ from datetime import datetime, timezone
35
+ from pathlib import Path
36
+
37
+ from .textparse import extract_pdf_text, read_jsonl
38
+
39
+ EXTRACTOR_VERSION = "answers_regex_v1"
40
+
41
+
42
def _now() -> str:
    """Return the current UTC time as an ISO-8601 string, second precision."""
    return datetime.now(timezone.utc).isoformat(timespec="seconds")
44
+
45
+
46
def _clean(text: str) -> str:
    """Normalise PDF whitespace artefacts while keeping paragraph breaks.

    Args:
        text: Raw text extracted from a PDF; falsy input yields ``""``.

    Returns:
        The text with standalone numeric lines and "(Para N.N)" references
        removed, runs of spaces/tabs collapsed to one space, runs of three
        or more newlines reduced to two, and outer whitespace stripped.
    """
    if not text:
        return ""
    # Strip page numbers that often appear as standalone numeric lines.
    text = re.sub(r"^\s*\d+\s*$", "", text, flags=re.MULTILINE)
    # Drop "(Para 2.15)" style cross-references — they're metadata, not text.
    text = re.sub(r"\(Para\s+\d+(?:\.\d+)*\)", "", text)
    # Collapse spaces within lines but keep newlines (paragraph structure).
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
60
+
61
+
62
+ # -------------------------------------------------------------------------
63
+ # Q/A extractor
64
+ # -------------------------------------------------------------------------
65
+
66
# Markers indicating the boundary between question text and answer text in
# parliamentary Q/A PDFs, listed from most to least specific.
#
# NOTE: ``_QA_REPLY_RE`` joins them into a single alternation, so
# ``search()`` returns the match that starts earliest in the text; list
# order only decides between alternatives matching at the same position.
_QA_REPLY_PATTERNS = [
    r"^\s*ANSWER\s*$",  # Bare "ANSWER" header line
    r"^\s*REPLY\s*$",  # Bare "REPLY"
    r"\bTO\s+BE\s+ANSWERED\s+ON\b",  # Often followed by date, then answer
    r"^\s*Reply\s+by\b.{0,200}?:",  # "Reply by [Minister name]:"
    r"^\s*Answer\s+by\b.{0,200}?:",
    r"\bSHRI\b.{0,60}\b(?:MINISTER|MOS)\b",  # "SHRI X, MINISTER OF Y"
]
_QA_REPLY_RE = re.compile(
    "|".join(f"({p})" for p in _QA_REPLY_PATTERNS),
    re.IGNORECASE | re.MULTILINE,
)
81
+
82
+
83
@dataclass
class QaExtraction:
    """Result of splitting one Q/A document into question and answer halves."""

    question_text: str  # Full raw question half (incl. boilerplate)
    answer_text: str  # Full raw answer half (incl. minister preamble)
    confidence: float
    extractor: str = EXTRACTOR_VERSION
    boundary_marker: str = ""
    # v0.6.5 — structured sub-fields derived from question_text/answer_text.
    # Additive: legacy consumers reading question_text / answer_text are
    # unaffected. Designed for the v0.7.0 mp-draft bridge feature, which
    # needs clean substrings (subject, body) for semantic indexing rather
    # than the noisy full PDF prelude.
    question_subject: str = ""  # e.g. "ANNUAL INCOME OF SHGS"
    question_stem: str = ""  # e.g. "Will the Minister of RURAL DEVELOPMENT be pleased to state:"
    question_body: str = ""  # The actual (a)/(b)/(c)/(d) sub-questions
    answer_minister_name: str = ""
    answer_body: str = ""  # Answer text with minister-name preamble stripped

    def to_record(self) -> dict:
        """Return the extraction as a plain dict with ``kind="qa_response"``.

        The five structured sub-fields are included only when non-empty.
        """
        rec = {
            "kind": "qa_response",
            "question_text": self.question_text,
            "answer_text": self.answer_text,
            "confidence": self.confidence,
            "extractor": self.extractor,
            "boundary_marker": self.boundary_marker,
        }
        # Only emit structured fields when we actually parsed them, to
        # avoid lying with empty-string defaults on legacy/edge records.
        optional_fields = (
            "question_subject",
            "question_stem",
            "question_body",
            "answer_minister_name",
            "answer_body",
        )
        for name in optional_fields:
            value = getattr(self, name)
            if value:
                rec[name] = value
        return rec
123
+
124
+
125
+ # -------------------------------------------------------------------------
126
+ # v0.6.5 — structured sub-extraction within Q/A halves
127
+ # -------------------------------------------------------------------------
128
+
129
# The "ANSWERED ON" date line is the most reliable boundary marker between
# the boilerplate header (GOVERNMENT OF INDIA / MINISTRY / DEPT / LOK SABHA
# / QUESTION NO. / ANSWERED ON DATE) and the substantive question content
# (subject line, asker, stem, body).
_ANSWERED_ON_RE = re.compile(
    r"\bANSWERED\s+ON\b\s*[:.,]?\s*\d.*?$",
    re.IGNORECASE | re.MULTILINE,
)

# The asker line: "1147. SHRI. <NAME>:" or
# "*123. SHRIMATI <NAME>:". Number + honorific + name + colon.
_ASKER_LINE_RE = re.compile(
    r"^\s*\*?\s*\d+\.?\s+(?:SHRI|SHRIMATI|SMT|DR|KUMARI|PROF)"
    r"[A-Z\.\s,]+:\s*$",
    re.IGNORECASE | re.MULTILINE,
)

# The question stem: "Will the Minister of X be pleased to state:" or
# variants like "be pleased to refer to the answer given...". Anchored on
# "Will the Minister" so it doesn't catch other "Will" phrases.
_QUESTION_STEM_RE = re.compile(
    r"\bWill\s+the\s+Minister\b.*?:",
    re.IGNORECASE | re.DOTALL,
)

# Minister-name preamble in the answer half: "MINISTER OF STATE IN THE
# MINISTRY OF X (NAME)" or "THE MINISTER OF X: (NAME)". The name is in
# parentheses; we capture it.
#
# The bridge between "MINISTER OF X" and the captured "(NAME)" is bounded
# at ~250 chars and cannot cross a parenthesis, so we don't walk into the
# answer body and capture a sub-item paren like "(a) The Ministry...".
# The parenthesised text must be at least 4 characters long (lookahead)
# and contain whitespace between two non-space characters, so
# single-letter false positives like "(a)" are excluded while any
# "FIRST LAST" name — including short first tokens such as "RAM NATH" —
# is accepted.
_MINISTER_NAME_RE = re.compile(
    r"\b(?:THE\s+)?MINISTER\s+(?:OF\s+STATE\s+(?:IN\s+THE\s+MINISTRY\s+OF|FOR)\b|OF\s+(?!STATE)|FOR)"
    r"[^()]{0,250}?\((?=[^()]{4,}\))(?P<name>[^()]*[^()\s]\s+[^()\s][^()]*)\)",
    re.IGNORECASE | re.DOTALL,
)
169
+
170
+
171
def _parse_question_subject(question_text: str) -> str:
    """Extract the subject line that sits between the "ANSWERED ON" header
    and the asker line.

    Args:
        question_text: The question half of a Q/A document.

    Returns:
        The non-empty lines between the two anchors joined with single
        spaces and truncated to 200 characters. When no asker line is
        found, only the first 300 characters after the date line are
        considered. Returns ``""`` if there is no "ANSWERED ON" line or
        nothing but whitespace follows it.
    """
    date_line = _ANSWERED_ON_RE.search(question_text)
    if not date_line:
        return ""
    tail = question_text[date_line.end():]
    # The asker line bounds the subject from below; without it, fall back
    # to a fixed-size window so we don't swallow the whole question.
    asker = _ASKER_LINE_RE.search(tail)
    candidate = tail[: asker.start()] if asker else tail[:300]
    # Subjects may wrap across several lines; join the non-blank ones.
    lines = [ln.strip() for ln in candidate.splitlines() if ln.strip()]
    if not lines:
        return ""
    return " ".join(lines).strip()[:200]
192
+
193
+
194
def _parse_question_stem_and_body(question_text: str) -> tuple[str, str]:
    """Split the substantive question into stem and body.

    The stem is the first ``_QUESTION_STEM_RE`` match ("Will the Minister
    of X be pleased to state:"); the body is everything after it (the
    (a)/(b)/(c)/(d) sub-questions), stripped of outer whitespace and
    otherwise unmodified.

    Returns:
        ``(stem, body)``, or ``("", "")`` if no stem is found.
    """
    m = _QUESTION_STEM_RE.search(question_text)
    if not m:
        return "", ""
    stem = m.group(0).strip()
    body = question_text[m.end():].strip()
    return stem, body
208
+
209
+
210
def _parse_answer_minister_and_body(answer_text: str) -> tuple[str, str]:
    """Pull the minister's name out of the answer prelude.

    Args:
        answer_text: The answer half of a Q/A document.

    Returns:
        ``(name, body)`` where ``name`` is the parenthesised name captured
        by ``_MINISTER_NAME_RE`` and ``body`` is the text after the
        closing parenthesis with leading whitespace/colons removed.
        Returns ``("", answer_text)`` if no preamble is found.
    """
    m = _MINISTER_NAME_RE.search(answer_text)
    if not m:
        return "", answer_text
    name = (m.group("name") or "").strip()
    # Strip leftover punctuation immediately after the captured "(NAME)" —
    # PDFs often have ":" and/or whitespace separating the prelude from
    # the answer body. Without this, the body starts with stray ":\n".
    body = re.sub(r"^[\s:]+", "", answer_text[m.end():])
    # Edge case: answers occasionally list *both* a State minister and a
    # Cabinet minister in the prelude. Only the first match is captured;
    # the second name stays in the returned body, where it is still
    # searchable.
    return name, body
228
+
229
+
230
def split_qa(text: str) -> QaExtraction | None:
    """Split a Q/A PDF's full text into question + answer halves, plus
    structured sub-fields for semantic indexing.

    The boundary markers in ``_QA_REPLY_PATTERNS`` are tried one at a time
    in list order (most specific first). The first marker that leaves
    non-empty text on both sides is used. Searching the combined
    alternation instead would pick whichever marker occurs earliest in the
    text, letting a header phrase such as "TO BE ANSWERED ON" win over a
    later bare "ANSWER" line.

    Returns ``None`` when the cleaned text is empty or no marker yields a
    non-empty question and answer. Caller decides what to do (skip; fall
    back to whole-text classification with lower confidence).

    The five structured sub-fields (``question_subject``, ``question_stem``,
    ``question_body``, ``answer_minister_name``, ``answer_body``) are
    best-effort: ``to_record()`` omits any that come back empty. Legacy
    ``question_text`` and ``answer_text`` are always populated when the
    function returns a non-None result.
    """
    cleaned = _clean(text)
    if not cleaned:
        return None

    # Same flags as the combined ``_QA_REPLY_RE``.
    flags = re.IGNORECASE | re.MULTILINE
    for pattern in _QA_REPLY_PATTERNS:
        m = re.search(pattern, cleaned, flags)
        if not m:
            continue
        question = cleaned[: m.start()].strip()
        answer = cleaned[m.end():].strip()
        if question and answer:
            break
    else:
        # No marker produced a usable split.
        return None

    subject = _parse_question_subject(question)
    stem, body = _parse_question_stem_and_body(question)
    minister, answer_body = _parse_answer_minister_and_body(answer)
    return QaExtraction(
        question_text=question,
        answer_text=answer,
        # Very short answers are more likely to be a mis-split.
        confidence=0.85 if len(answer) > 50 else 0.5,
        boundary_marker=m.group(0).strip(),
        question_subject=subject,
        question_stem=stem,
        question_body=body,
        answer_minister_name=minister,
        answer_body=answer_body,
    )
270
+
271
+
272
+ # -------------------------------------------------------------------------
273
+ # ATR extractor
274
+ # -------------------------------------------------------------------------
275
+
276
# Recommendation markers — "Recommendation No. X", "Recommendation (Sl. No. X)",
# "Observation/Recommendation No. X", or "Recommendation \n 1.".
# The optional "(" / ")" around the numbered form is what lets the
# parenthesised "(Sl. No. X)" variant match. Exactly one of the two
# capture groups is set per match.
_ATR_REC_RE = re.compile(
    r"(?:Observation\s*/\s*)?Recommendation\s+\(?\s*(?:No\.?|Sl\.?\s*No\.?|Serial\s*No\.?)\s*(\d+)\)?"
    r"|(?:\n|^)\s*Recommendation\s*\n\s*(\d+)\.",
    re.IGNORECASE | re.MULTILINE,
)

# Reply markers — "Reply of the Government", "Action Taken by the Government",
# "Ministry's Reply", "Action Taken". The longer "Action Taken by the
# Government" alternative is listed before plain "Action Taken" so the
# full phrase is consumed when present.
_ATR_REPLY_RE = re.compile(
    r"(?:Reply\s+of\s+the\s+Government"
    r"|Action\s+Taken\s+by\s+the\s+Government"
    r"|Action\s+Taken"
    r"|Ministry'?s\s+Reply"
    r"|Comments\s+of\s+the\s+(?:Ministry|Government))",
    re.IGNORECASE | re.MULTILINE,
)
294
+
295
+
296
@dataclass
class AtrExtraction:
    """One recommendation/response pair extracted from an ATR document."""

    recommendation_no: int
    recommendation_text: str
    response_text: str  # Empty when no reply marker was found
    confidence: float
    extractor: str = EXTRACTOR_VERSION

    def to_record(self) -> dict:
        """Return the extraction as a plain dict with ``kind="atr_response"``."""
        return {
            "kind": "atr_response",
            "recommendation_no": self.recommendation_no,
            "recommendation_text": self.recommendation_text,
            "response_text": self.response_text,
            "confidence": self.confidence,
            "extractor": self.extractor,
        }
313
+
314
+
315
def split_atr(text: str) -> list[AtrExtraction]:
    """Split an ATR PDF's text into (rec_no, rec_text, response_text) triples.

    The cleaned text is split on ``_ATR_REC_RE``; within each resulting
    body the first ``_ATR_REPLY_RE`` match separates the recommendation
    from the response.

    Confidence is 0.9 when both sides of a reply marker are non-empty,
    0.5 when a marker is found but the response is empty, and 0.4 when no
    reply marker is found (the whole body becomes the recommendation).
    Entries whose recommendation text is empty are dropped.

    Returns an empty list if no recommendation markers are found.
    """
    cleaned = _clean(text)
    if not cleaned:
        return []
    chunks = _ATR_REC_RE.split(cleaned)
    # split() with N capture groups yields
    # [pre, g1, ..., gN, body1, g1, ..., gN, body2, ...].
    num_groups = _ATR_REC_RE.groups
    stride = num_groups + 1
    if len(chunks) < stride + 1:
        return []

    out: list[AtrExtraction] = []
    for i in range(1, len(chunks) - stride + 1, stride):
        # Only one alternative's group is populated per match.
        rec_no_raw = next((c for c in chunks[i:i + num_groups] if c), None)
        try:
            rec_no = int(rec_no_raw) if rec_no_raw else None
        except (ValueError, TypeError):
            rec_no = None
        if rec_no is None:
            continue

        body = chunks[i + num_groups] or ""
        # Within the body, find the "Reply ..." boundary; everything before
        # is the recommendation, everything after is the response.
        reply_m = _ATR_REPLY_RE.search(body)
        if reply_m:
            rec_text = body[: reply_m.start()].strip()
            resp_text = body[reply_m.end():].strip()
            confidence = 0.9 if (rec_text and resp_text) else 0.5
        else:
            # Whole body becomes the recommendation; no reply found.
            rec_text = body.strip()
            resp_text = ""
            confidence = 0.4

        if rec_text:
            out.append(AtrExtraction(
                recommendation_no=rec_no,
                recommendation_text=rec_text,
                response_text=resp_text,
                confidence=confidence,
            ))
    return out
364
+
365
+
366
+ # -------------------------------------------------------------------------
367
+ # DFG / original committee report extractor
368
+ # -------------------------------------------------------------------------
369
+
370
# The recommendations section header. PDFs vary: "OBSERVATIONS/RECOMMENDATIONS
# OF THE COMMITTEE", "OBSERVATIONS / RECOMMENDATIONS", etc.
_DFG_SECTION_RE = re.compile(
    r"OBSERVATIONS\s*/\s*RECOMMENDATIONS(?:\s+OF\s+THE\s+COMMITTEE)?",
    re.IGNORECASE,
)

# Numbered paragraph: line starting with "<digits>." followed by whitespace.
# Non-line-anchored use is too greedy (matches in body text); we anchor to
# line start (after newline) and require the number-period-whitespace to
# start a new paragraph. The single capture group holds the number.
_DFG_PARA_RE = re.compile(r"(?:^|\n)\s*(\d+)\.\s+", re.MULTILINE)
382
+
383
+
384
@dataclass
class DfgExtraction:
    """One numbered recommendation paragraph from a committee report."""

    recommendation_no: int
    recommendation_text: str
    confidence: float
    extractor: str = EXTRACTOR_VERSION

    def to_record(self) -> dict:
        """Return the extraction as a plain dict with
        ``kind="dfg_recommendation"``.
        """
        return {
            "kind": "dfg_recommendation",
            "recommendation_no": self.recommendation_no,
            "recommendation_text": self.recommendation_text,
            "confidence": self.confidence,
            "extractor": self.extractor,
        }
399
+
400
+
401
def split_dfg(text: str) -> list[DfgExtraction]:
    """Find the recommendations section and split into numbered paragraphs.

    DFG (Demands for Grants) reports list committee observations and
    recommendations as numbered paragraphs in a dedicated section.

    The section starts after the LAST ``_DFG_SECTION_RE`` match and ends
    at the first following line that begins with ANNEXURE / MINUTES OF
    THE / APPENDIX / ADDENDUM, or at the end of the text. Each paragraph
    body is passed through ``_clean``; empty bodies are dropped.
    Confidence is 0.8 for bodies longer than 80 characters, else 0.5.

    Returns an empty list if the text is empty, the section header isn't
    found, or the section contains no numbered paragraphs.
    """
    if not text:
        return []
    # Use the LAST occurrence of the section header — the first occurrence
    # is typically a TOC entry; the actual section appears later.
    headers = list(_DFG_SECTION_RE.finditer(text))
    if not headers:
        return []
    section_text = text[headers[-1].end():]

    # Cap at next major section heading or end of document.
    end_match = re.search(
        r"\n\s*(?:ANNEXURE|MINUTES\s+OF\s+THE|APPENDIX|ADDENDUM)\b",
        section_text,
        re.IGNORECASE,
    )
    if end_match:
        section_text = section_text[: end_match.start()]

    # split() with one capture group yields [pre, no1, body1, no2, body2, ...].
    chunks = _DFG_PARA_RE.split(section_text)
    if len(chunks) < 3:
        return []

    out: list[DfgExtraction] = []
    for i in range(1, len(chunks) - 1, 2):
        try:
            rec_no = int(chunks[i])
        except (ValueError, TypeError):
            continue
        body = _clean(chunks[i + 1] or "")
        if body:
            out.append(DfgExtraction(
                recommendation_no=rec_no,
                recommendation_text=body,
                confidence=0.8 if len(body) > 80 else 0.5,
            ))
    return out
448
+
449
+
450
+ # -------------------------------------------------------------------------
451
+ # Corpus dispatcher
452
+ # -------------------------------------------------------------------------
453
+
454
+
455
@dataclass
class ExtractionStats:
    """Counters and error log for one extraction pass."""

    qa_records: int = 0
    atr_records: int = 0
    dfg_records: int = 0
    skipped_no_pdf: int = 0
    skipped_no_text: int = 0
    skipped_no_split: int = 0
    sources_processed: int = 0
    # One dict per failure; default_factory gives each instance its own list.
    errors: list[dict] = field(default_factory=list)
465
+
466
+
467
def _classify_source(rec: dict) -> str:
    """Decide which extractor applies to a manifest record.

    Returns ``'qa'`` | ``'atr'`` | ``'observations'`` | ``'skip'``.

    * ``kind == "qa"`` → ``'qa'``
    * ``kind == "committee_report"`` with
      ``report_type == "action_taken"`` → ``'atr'``
    * any other ``committee_report`` → ``'observations'``
    * everything else → ``'skip'``

    The ``'observations'`` bucket covers any committee report that
    contains numbered Observations/Recommendations text — Demands for
    Grants reports, Bill examinations, and Subject (own-initiative)
    reports. They share an extractor; downstream code can still tell them
    apart via the manifest's ``report_type``.

    Prior to v0.6.3 this function returned ``'dfg'`` for everything
    non-ATR, which mislabelled subject and bill reports as DFG.
    """
    kind = rec.get("kind") or ""
    if kind == "qa":
        return "qa"
    if kind != "committee_report":
        return "skip"
    if (rec.get("report_type") or "") == "action_taken":
        return "atr"
    # demands_for_grants | bill | subject | other (legacy "original")
    # all dispatch to the observations extractor.
    return "observations"
496
+
497
+
498
def _pdf_for_record(rec: dict, out_dir: Path) -> Path | None:
    """Resolve a manifest record's ``pdf_path`` to a usable file under
    ``out_dir``.

    Args:
        rec: Manifest record; its ``pdf_path`` is taken relative to
            ``out_dir``.
        out_dir: Corpus output directory.

    Returns:
        ``out_dir / pdf_path`` when that is a regular file larger than
        1000 bytes located under ``out_dir``; otherwise ``None``.
    """
    rel = rec.get("pdf_path")
    if not rel:
        return None
    # Refuse paths that climb out of the corpus directory.
    if ".." in Path(rel).parts:
        return None
    candidate = out_dir / rel
    try:
        # An absolute ``pdf_path`` replaces ``out_dir`` when joined; callers
        # rely on ``relative_to(out_dir)`` succeeding on the returned path.
        candidate.relative_to(out_dir)
    except ValueError:
        return None
    try:
        if not candidate.is_file():
            return None
        size = candidate.stat().st_size
    except OSError:
        # File vanished or became unreadable between the two calls.
        return None
    return candidate if size > 1000 else None
504
+
505
+
506
def extract_answers(
    out_dir: Path, *, refresh: bool = False, log_fn=print
) -> ExtractionStats:
    """Walk ``manifest.jsonl``, run the right extractor per record, write
    ``answers.jsonl``. Returns stats for telemetry / CLI output.

    Args:
        out_dir: Directory holding ``manifest.jsonl`` and the PDFs it
            references; ``answers.jsonl`` is written here.
        refresh: Accepted for interface compatibility but not read by this
            function — every call re-extracts everything.
        log_fn: Callable that receives the one-line summary.

    Returns:
        Counters for the run. Per-record failures are collected in
        ``stats.errors`` instead of being raised.

    ``answers.jsonl`` is rebuilt from scratch on every call and replaced
    via a temporary sibling file; ``manifest.jsonl`` and the PDFs are only
    read.
    """
    stats = ExtractionStats()
    manifest_path = out_dir / "manifest.jsonl"
    out_path = out_dir / "answers.jsonl"
    records = read_jsonl(manifest_path)
    out_dir.mkdir(parents=True, exist_ok=True)

    out_records: list[dict] = []
    for rec in records:
        kind = _classify_source(rec)
        if kind == "skip":
            continue
        stats.sources_processed += 1
        pdf = _pdf_for_record(rec, out_dir)
        if not pdf:
            stats.skipped_no_pdf += 1
            continue
        try:
            text = extract_pdf_text(pdf)
        except Exception as exc:  # noqa: BLE001
            stats.errors.append({"key": rec.get("key"), "where": "pdftotext", "error": repr(exc)})
            continue
        if not text or not text.strip():
            stats.skipped_no_text += 1
            continue

        try:
            source_pdf = str(pdf.relative_to(out_dir))
        except ValueError as exc:
            # A PDF path outside ``out_dir`` must not abort the whole run.
            stats.errors.append({"key": rec.get("key"), "where": "pdf_path", "error": repr(exc)})
            continue

        common = {
            "key": rec.get("key"),
            "run_id": rec.get("run_id"),
            "source_pdf": source_pdf,
            "extracted_at": _now(),
            "language_classified": ["en"],
            # Carry the manifest's report_type forward so downstream
            # consumers can distinguish numbered observations that came
            # from a Demands-for-Grants vs Bill vs Subject report. Prior
            # to v0.6.3 this distinction was not surfaced and all non-ATR
            # records were tagged 'dfg_recommendation' regardless of source.
            "source_report_type": rec.get("report_type"),
        }

        try:
            if kind == "qa":
                qa = split_qa(text)
                if qa is None:
                    stats.skipped_no_split += 1
                    continue
                out_records.append({**common, **qa.to_record()})
                stats.qa_records += 1
            elif kind == "atr":
                items = split_atr(text)
                if not items:
                    stats.skipped_no_split += 1
                    continue
                out_records.extend({**common, **item.to_record()} for item in items)
                stats.atr_records += len(items)
            elif kind == "observations":
                items = split_dfg(text)
                if not items:
                    stats.skipped_no_split += 1
                    continue
                out_records.extend({**common, **item.to_record()} for item in items)
                stats.dfg_records += len(items)
        except Exception as exc:  # noqa: BLE001
            stats.errors.append({"key": rec.get("key"), "where": kind, "error": repr(exc)})

    # Write to a sibling temp file, then rename over the target. Use
    # ``with_name`` rather than ``with_suffix`` because
    # ``Path("answers.jsonl").with_suffix(".jsonl.tmp")`` is ambiguous
    # across pathlib versions.
    tmp = out_path.with_name(out_path.name + ".tmp")
    try:
        with tmp.open("w", encoding="utf-8") as f:
            f.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in out_records)
        tmp.replace(out_path)
    except BaseException:
        # Don't leave a half-written temp file behind; the previous
        # ``answers.jsonl`` (if any) is untouched at this point.
        tmp.unlink(missing_ok=True)
        raise

    log_fn(
        f"answers.jsonl: qa={stats.qa_records} atr={stats.atr_records} "
        f"dfg={stats.dfg_records} skipped_no_pdf={stats.skipped_no_pdf} "
        f"skipped_no_text={stats.skipped_no_text} skipped_no_split={stats.skipped_no_split} "
        f"errors={len(stats.errors)}"
    )
    return stats