commoner-probe 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- commoner_probe/__init__.py +62 -0
- commoner_probe/__main__.py +5 -0
- commoner_probe/answers.py +598 -0
- commoner_probe/atr_linkage.py +275 -0
- commoner_probe/base.py +169 -0
- commoner_probe/cli.py +466 -0
- commoner_probe/committees.py +603 -0
- commoner_probe/corpus.py +312 -0
- commoner_probe/csr/__init__.py +6 -0
- commoner_probe/csr/mca.py +178 -0
- commoner_probe/dmft/__init__.py +3 -0
- commoner_probe/dmft/mines.py +238 -0
- commoner_probe/entities.py +440 -0
- commoner_probe/evidence.py +250 -0
- commoner_probe/example_topics/__init__.py +23 -0
- commoner_probe/example_topics/affirmative_action.json +82 -0
- commoner_probe/example_topics/home_affairs_starred.json +31 -0
- commoner_probe/example_topics/libraries.json +66 -0
- commoner_probe/example_topics/mines_dmft_pmkkky.json +26 -0
- commoner_probe/example_topics/narcotics_substance.json +44 -0
- commoner_probe/http_client.py +206 -0
- commoner_probe/members.py +127 -0
- commoner_probe/neva.py +663 -0
- commoner_probe/records.py +350 -0
- commoner_probe/resolver.py +169 -0
- commoner_probe/runlog.py +189 -0
- commoner_probe/sansad.py +469 -0
- commoner_probe/schemas/__init__.py +60 -0
- commoner_probe/schemas/answers_atr_response.schema.json +22 -0
- commoner_probe/schemas/answers_dfg_recommendation.schema.json +21 -0
- commoner_probe/schemas/answers_qa_response.schema.json +27 -0
- commoner_probe/schemas/atr_linkage.schema.json +19 -0
- commoner_probe/schemas/committee_members.schema.json +43 -0
- commoner_probe/schemas/entities_bureaucratic_posting.schema.json +19 -0
- commoner_probe/schemas/entities_committee_membership.schema.json +18 -0
- commoner_probe/schemas/entities_ministerial_appointment.schema.json +17 -0
- commoner_probe/schemas/entities_mp_membership.schema.json +20 -0
- commoner_probe/schemas/entities_person.schema.json +16 -0
- commoner_probe/schemas/manifest_committee_report.schema.json +43 -0
- commoner_probe/schemas/manifest_mca_csr.schema.json +72 -0
- commoner_probe/schemas/manifest_mines_dmft.schema.json +94 -0
- commoner_probe/schemas/manifest_qa.schema.json +58 -0
- commoner_probe/schemas/runs.schema.json +39 -0
- commoner_probe/schemas/state_assembly_member.schema.json +64 -0
- commoner_probe/schemas/state_assembly_paper_laid.schema.json +24 -0
- commoner_probe/schemas/state_assembly_question.schema.json +84 -0
- commoner_probe/schemas/state_assembly_question_unlisted.schema.json +84 -0
- commoner_probe/stats.py +235 -0
- commoner_probe/textparse.py +79 -0
- commoner_probe/topics.py +44 -0
- commoner_probe/url_safety.py +82 -0
- commoner_probe/validate.py +213 -0
- commoner_probe-0.4.0.dist-info/METADATA +531 -0
- commoner_probe-0.4.0.dist-info/RECORD +58 -0
- commoner_probe-0.4.0.dist-info/WHEEL +5 -0
- commoner_probe-0.4.0.dist-info/entry_points.txt +2 -0
- commoner_probe-0.4.0.dist-info/licenses/LICENSE +21 -0
- commoner_probe-0.4.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# SPDX-License-Identifier: MIT
|
|
2
|
+
"""Data-pulling crawler for Indian Parliament question corpora."""
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from . import schemas as schemas
|
|
8
|
+
from .corpus import AtrChain, Corpus, QaPair # noqa: F401
|
|
9
|
+
from .records import ( # noqa: F401
|
|
10
|
+
AnswerAtrResponse,
|
|
11
|
+
AnswerDfgRecommendation,
|
|
12
|
+
AnswerQaResponse,
|
|
13
|
+
AtrLinkageRecord,
|
|
14
|
+
BureaucraticPosting,
|
|
15
|
+
CommitteeMembership,
|
|
16
|
+
ManifestCommitteeReportRecord,
|
|
17
|
+
ManifestMinesDmftRecord,
|
|
18
|
+
ManifestQaRecord,
|
|
19
|
+
MinisterialAppointment,
|
|
20
|
+
MpMembership,
|
|
21
|
+
Person,
|
|
22
|
+
RunRecord,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
__all__ = [
|
|
26
|
+
"__version__",
|
|
27
|
+
"schemas",
|
|
28
|
+
"Corpus",
|
|
29
|
+
"QaPair",
|
|
30
|
+
"AtrChain",
|
|
31
|
+
"ManifestQaRecord",
|
|
32
|
+
"ManifestCommitteeReportRecord",
|
|
33
|
+
"ManifestMinesDmftRecord",
|
|
34
|
+
"AnswerQaResponse",
|
|
35
|
+
"AnswerAtrResponse",
|
|
36
|
+
"AnswerDfgRecommendation",
|
|
37
|
+
"AtrLinkageRecord",
|
|
38
|
+
"RunRecord",
|
|
39
|
+
"Person",
|
|
40
|
+
"MpMembership",
|
|
41
|
+
"CommitteeMembership",
|
|
42
|
+
"MinisterialAppointment",
|
|
43
|
+
"BureaucraticPosting",
|
|
44
|
+
]
|
|
45
|
+
def _version_from_pyproject() -> str | None:
|
|
46
|
+
"""Fallback for source-tree runs before the package is installed."""
|
|
47
|
+
pyproject = Path(__file__).resolve().parent.parent / "pyproject.toml"
|
|
48
|
+
if not pyproject.exists():
|
|
49
|
+
return None
|
|
50
|
+
match = re.search(r'^version = "([^"]+)"$', pyproject.read_text(encoding="utf-8"), re.MULTILINE)
|
|
51
|
+
return match.group(1) if match else None
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _resolve_version() -> str:
    """Return the package version string.

    The installed distribution metadata for ``commoner-probe`` is asked
    first. If that lookup fails for any reason, the version is read from
    ``pyproject.toml`` via ``_version_from_pyproject``; when that also
    yields nothing, ``"0.0.0"`` is returned.
    """
    try:
        from importlib import metadata as _metadata

        installed = _metadata.version("commoner-probe")
    except Exception:
        # Deliberately broad: any metadata failure falls back to the
        # source-tree lookup rather than breaking package import.
        from_source = _version_from_pyproject()
        return from_source if from_source else "0.0.0"
    return installed
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
__version__ = _resolve_version()
|
|
@@ -0,0 +1,598 @@
|
|
|
1
|
+
# SPDX-License-Identifier: MIT
|
|
2
|
+
"""Phase 1: structured text extraction from parliamentary PDFs.
|
|
3
|
+
|
|
4
|
+
Three shapes of source PDF, three extractors:
|
|
5
|
+
|
|
6
|
+
* **Q/A PDFs** (parliamentary questions): split a single
|
|
7
|
+
question+answer into ``(question_text, answer_text)`` on the
|
|
8
|
+
"Reply by ..." / "Answer" boundary. Output: one record per source.
|
|
9
|
+
|
|
10
|
+
* **ATR PDFs** (Action-Taken Reports — government's response to a
|
|
11
|
+
prior committee report): split into
|
|
12
|
+
``[(recommendation_no, recommendation_text, response_text), ...]``
|
|
13
|
+
on "Recommendation No. X" / "Reply of the Government" boundaries.
|
|
14
|
+
One source PDF → many records.
|
|
15
|
+
|
|
16
|
+
* **DFG / original committee reports**: find the
|
|
17
|
+
``OBSERVATIONS/RECOMMENDATIONS OF THE COMMITTEE`` section, split on
|
|
18
|
+
numbered paragraphs. Output: ``[(recommendation_no,
|
|
19
|
+
recommendation_text), ...]`` — no response text because the
|
|
20
|
+
executive hasn't replied yet (that arrives in a future ATR).
|
|
21
|
+
|
|
22
|
+
This module is **extraction only**. Classification (counterinsurgency
|
|
23
|
+
labels) is Phase 2 in ``discourse.py``.
|
|
24
|
+
|
|
25
|
+
Schema commitments for ``answers.jsonl`` are documented in the file's
|
|
26
|
+
header comments and in `notes/PLAN_v0.5.0_SCOPE.md`.
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
from __future__ import annotations
|
|
30
|
+
|
|
31
|
+
import json
|
|
32
|
+
import re
|
|
33
|
+
from dataclasses import dataclass, field
|
|
34
|
+
from datetime import datetime, timezone
|
|
35
|
+
from pathlib import Path
|
|
36
|
+
|
|
37
|
+
from .textparse import extract_pdf_text, read_jsonl
|
|
38
|
+
|
|
39
|
+
EXTRACTOR_VERSION = "answers_regex_v1"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _now() -> str:
    """Return the current UTC time as an ISO-8601 string, to the second."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat(timespec="seconds")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _clean(text: str) -> str:
    """Normalise whitespace artefacts in text extracted from a PDF.

    Applies, in order: removal of digit-only lines, removal of
    ``(Para N.N)`` cross-references, collapsing of space/tab runs to a
    single space, and collapsing of three or more newlines to two. The
    result is stripped. Newlines are otherwise kept, so paragraph
    boundaries survive.

    Returns:
        The cleaned text, or ``""`` for empty/falsy input.
    """
    if not text:
        return ""
    # (pattern, replacement, flags) applied sequentially; order matters.
    passes = (
        # Standalone numeric lines are treated as page numbers.
        (r"^\s*\d+\s*$", "", re.MULTILINE),
        # "(Para 2.15)" style cross-references are metadata, not text.
        (r"\(Para\s+\d+(?:\.\d+)*\)", "", 0),
        # Collapse horizontal whitespace only; newlines carry structure.
        (r"[ \t]+", " ", 0),
        (r"\n{3,}", "\n\n", 0),
    )
    cleaned = text
    for pattern, replacement, flags in passes:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# -------------------------------------------------------------------------
|
|
63
|
+
# Q/A extractor
|
|
64
|
+
# -------------------------------------------------------------------------
|
|
65
|
+
|
|
66
|
+
# Markers for the boundary between question text and answer text in
# parliamentary Q/A PDFs, listed from most to least specific.
#
# NOTE: in the combined alternation below, list order only decides which
# alternative wins when two of them can match at the *same* position;
# ``search`` on the combined pattern still returns the leftmost match in
# the text, whichever alternative produced it.
_QA_REPLY_PATTERNS = [
    r"^\s*ANSWER\s*$",                        # Bare "ANSWER" header line
    r"^\s*REPLY\s*$",                         # Bare "REPLY"
    r"\bTO\s+BE\s+ANSWERED\s+ON\b",           # Often followed by date, then answer
    r"^\s*Reply\s+by\b.{0,200}?:",            # "Reply by [Minister name]:"
    r"^\s*Answer\s+by\b.{0,200}?:",
    r"\bSHRI\b.{0,60}\b(?:MINISTER|MOS)\b",   # "SHRI X, MINISTER OF Y"
]
# One capture group per marker, so the group index identifies which marker hit.
_QA_REPLY_RE = re.compile(
    "|".join("(" + marker + ")" for marker in _QA_REPLY_PATTERNS),
    flags=re.IGNORECASE | re.MULTILINE,
)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@dataclass
class QaExtraction:
    """Result of splitting one Q/A document into question and answer.

    ``question_text`` and ``answer_text`` hold the raw halves (question
    boilerplate and minister preamble included). The remaining string
    fields are optional structured sub-fields derived from those halves
    (introduced in v0.6.5); they default to ``""`` when not parsed, so
    consumers that only read the raw halves are unaffected.
    """

    question_text: str              # Full raw question half (incl. boilerplate)
    answer_text: str                # Full raw answer half (incl. minister preamble)
    confidence: float               # Heuristic score assigned by the extractor
    extractor: str = EXTRACTOR_VERSION
    boundary_marker: str = ""       # Text of the marker the split was made on
    question_subject: str = ""      # e.g. "ANNUAL INCOME OF SHGS"
    question_stem: str = ""         # e.g. "Will the Minister of RURAL DEVELOPMENT be pleased to state:"
    question_body: str = ""         # The (a)/(b)/(c)/(d) sub-questions
    answer_minister_name: str = ""
    answer_body: str = ""           # Answer text with the minister-name preamble stripped

    def to_record(self) -> dict:
        """Serialise to a plain dict of kind ``"qa_response"``.

        The six core keys are always present. Each structured sub-field
        is emitted only when non-empty, so a record never carries an
        empty-string placeholder for something that was not parsed.
        """
        record = {
            "kind": "qa_response",
            "question_text": self.question_text,
            "answer_text": self.answer_text,
            "confidence": self.confidence,
            "extractor": self.extractor,
            "boundary_marker": self.boundary_marker,
        }
        optional_fields = (
            "question_subject",
            "question_stem",
            "question_body",
            "answer_minister_name",
            "answer_body",
        )
        for field_name in optional_fields:
            value = getattr(self, field_name)
            if value:
                record[field_name] = value
        return record
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
# -------------------------------------------------------------------------
|
|
126
|
+
# v0.6.5 — structured sub-extraction within Q/A halves
|
|
127
|
+
# -------------------------------------------------------------------------
|
|
128
|
+
|
|
129
|
+
# The "ANSWERED ON" date line is the most reliable boundary marker between
|
|
130
|
+
# the boilerplate header (GOVERNMENT OF INDIA / MINISTRY / DEPT / LOK SABHA
|
|
131
|
+
# / QUESTION NO. / ANSWERED ON DATE) and the substantive question content
|
|
132
|
+
# (subject line, asker, stem, body).
|
|
133
|
+
_ANSWERED_ON_RE = re.compile(
    r"\bANSWERED\s+ON\b\s*[:.,]?\s*\d.*?$",
    flags=re.IGNORECASE | re.MULTILINE,
)

# Asker line: optional "*", the question number, an honorific, the name,
# and a colon that ends the line, e.g. "1147. SHRI. <NAME>:" or
# "*123. SHRIMATI <NAME>:".
_ASKER_LINE_RE = re.compile(
    r"^\s*\*?\s*\d+\.?\s+(?:SHRI|SHRIMATI|SMT|DR|KUMARI|PROF)"
    r"[A-Z\.\s,]+:\s*$",
    flags=re.IGNORECASE | re.MULTILINE,
)

# Question stem: from "Will the Minister" up to the first colon that
# follows (DOTALL, so the stem may wrap across lines). Anchoring on
# "Will the Minister" avoids catching other "Will ..." phrases.
_QUESTION_STEM_RE = re.compile(
    r"\bWill\s+the\s+Minister\b.*?:",
    flags=re.IGNORECASE | re.DOTALL,
)

# Minister-name preamble in the answer half, e.g. "MINISTER OF STATE IN
# THE MINISTRY OF X (NAME)" or "THE MINISTER OF X: (NAME)". The name is
# the parenthesised text and is captured as group "name".
#
# At most 250 non-parenthesis characters may separate the title from the
# opening "(", so the match cannot wander into the answer body. The
# captured text must be at least four characters, then a whitespace
# character, then at least one more character; that rejects sub-item
# markers such as "(a)".
_MINISTER_NAME_RE = re.compile(
    r"\b(?:THE\s+)?MINISTER\s+(?:OF\s+STATE\s+(?:IN\s+THE\s+MINISTRY\s+OF|FOR)\b|OF\s+(?!STATE)|FOR)"
    r"[^()]{0,250}?\((?P<name>[^()]{4,}?\s[^()]{1,})\)",
    flags=re.IGNORECASE | re.DOTALL,
)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _parse_question_subject(question_text: str) -> str:
    """Extract the subject text that follows the "ANSWERED ON" line.

    The candidate region starts right after the ``_ANSWERED_ON_RE`` match
    and ends at the asker line (``_ASKER_LINE_RE``); when no asker line
    is found, the first 300 characters after the date line are used. All
    non-blank lines in that region are stripped and joined with single
    spaces, and the result is capped at 200 characters.

    Returns:
        The subject string, or ``""`` when there is no "ANSWERED ON"
        line or the candidate region holds no text.
    """
    date_hit = _ANSWERED_ON_RE.search(question_text)
    if date_hit is None:
        return ""
    remainder = question_text[date_hit.end():]
    asker_hit = _ASKER_LINE_RE.search(remainder)
    if asker_hit is not None:
        window = remainder[: asker_hit.start()]
    else:
        # No asker line to bound the subject: fall back to a fixed window.
        window = remainder[:300]
    pieces = []
    for raw_line in window.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            pieces.append(trimmed)
    return " ".join(pieces)[:200]
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _parse_question_stem_and_body(question_text: str) -> tuple[str, str]:
    """Split the question into its stem and the sub-questions after it.

    The stem is the ``_QUESTION_STEM_RE`` match ("Will the Minister ...
    :"); the body is everything after that match. Both are stripped of
    surrounding whitespace and otherwise left untouched.

    Returns:
        ``(stem, body)``, or ``("", "")`` when no stem is found.
    """
    stem_hit = _QUESTION_STEM_RE.search(question_text)
    if stem_hit is None:
        return ("", "")
    stem_text = stem_hit.group(0).strip()
    trailing = question_text[stem_hit.end():]
    return stem_text, trailing.strip()
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _parse_answer_minister_and_body(answer_text: str) -> tuple[str, str]:
    """Separate the minister's name from the rest of the answer.

    Uses the first ``_MINISTER_NAME_RE`` match. The body is the text
    after that match with any leading whitespace and colons removed.
    If the prelude names more than one minister, only the first is
    captured; the rest stays in the returned body.

    Returns:
        ``(name, body)``, or ``("", answer_text)`` unchanged when no
        minister preamble is found.
    """
    hit = _MINISTER_NAME_RE.search(answer_text)
    if hit is None:
        return "", answer_text
    minister = (hit.group("name") or "").strip()
    after_name = answer_text[hit.end():]
    # PDFs often leave ":" and/or whitespace between "(NAME)" and the
    # answer proper; drop it so the body does not start with stray ":\n".
    cleaned_body = re.sub(r"^[\s:]+", "", after_name)
    return minister, cleaned_body
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def split_qa(text: str) -> QaExtraction | None:
    """Split a Q/A PDF's full text into question and answer halves, plus
    structured sub-fields for semantic indexing.

    The boundary is found by trying ``_QA_REPLY_PATTERNS`` one at a time,
    in list order, and splitting on the first pattern that matches
    anywhere in the cleaned text. A specific marker (a bare ``ANSWER``
    line) therefore wins over a looser one (``TO BE ANSWERED ON`` in the
    page header) even when the looser marker occurs earlier in the
    document. Searching the combined alternation instead would return
    the leftmost match regardless of priority and could cut the question
    half off before the "ANSWERED ON" line that
    ``_parse_question_subject`` looks for.

    Returns ``None`` when the cleaned text is empty, when no boundary
    marker is found, or when either half is empty after the split. The
    caller decides what to do (skip; fall back to whole-text
    classification with lower confidence).

    The five structured sub-fields (``question_subject``,
    ``question_stem``, ``question_body``, ``answer_minister_name``,
    ``answer_body``) are best-effort: the question parsers return an
    empty string when their anchor isn't found, in which case
    ``to_record()`` omits that field. ``answer_body`` falls back to the
    full answer text when no minister preamble is found. Legacy
    ``question_text`` and ``answer_text`` are always populated on a
    non-None result.
    """
    cleaned = _clean(text)
    if not cleaned:
        return None

    # Priority search: the first *pattern* that matches wins, not the
    # leftmost *position*. Flags mirror those of ``_QA_REPLY_RE``.
    boundary = None
    for pattern in _QA_REPLY_PATTERNS:
        boundary = re.search(pattern, cleaned, re.IGNORECASE | re.MULTILINE)
        if boundary is not None:
            break
    if boundary is None:
        return None

    question = cleaned[: boundary.start()].strip()
    answer = cleaned[boundary.end():].strip()
    if not question or not answer:
        return None

    subject = _parse_question_subject(question)
    stem, body = _parse_question_stem_and_body(question)
    minister, answer_body = _parse_answer_minister_and_body(answer)
    return QaExtraction(
        question_text=question,
        answer_text=answer,
        # Very short answers are more likely a mis-split; score them lower.
        confidence=0.85 if len(answer) > 50 else 0.5,
        boundary_marker=boundary.group(0).strip(),
        question_subject=subject,
        question_stem=stem,
        question_body=body,
        answer_minister_name=minister,
        answer_body=answer_body,
    )
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
# -------------------------------------------------------------------------
|
|
273
|
+
# ATR extractor
|
|
274
|
+
# -------------------------------------------------------------------------
|
|
275
|
+
|
|
276
|
+
# Recommendation markers. Two shapes are recognised, each capturing the
# recommendation number in its own group:
#   1. "Recommendation No. X", "Recommendation Sl. No. X",
#      "Recommendation Serial No. X", optionally prefixed with
#      "Observation/" and optionally parenthesised as
#      "Recommendation (Sl. No. X)".
#   2. "Recommendation" alone on a line, followed by "X." on the next.
# The optional "\(" / "\)" let the parenthesised form match and keep the
# closing parenthesis out of the recommendation body.
_ATR_REC_RE = re.compile(
    r"(?:Observation\s*/\s*)?Recommendation\s*(?:\n|\s+)\(?\s*(?:No\.?|Sl\.?\s*No\.?|Serial\s*No\.?)\s*(\d+)\)?"
    r"|(?:\n|^)\s*Recommendation\s*\n\s*(\d+)\.",
    re.IGNORECASE | re.MULTILINE,
)

# Reply markers: "Reply of the Government", "Action Taken by the
# Government", "Action Taken", "Ministry's Reply", and "Comments of the
# Ministry/Government". The longer "Action Taken by the Government" is
# listed before bare "Action Taken" so the full phrase is consumed.
_ATR_REPLY_RE = re.compile(
    r"(?:Reply\s+of\s+the\s+Government"
    r"|Action\s+Taken\s+by\s+the\s+Government"
    r"|Action\s+Taken"
    r"|Ministry'?s\s+Reply"
    r"|Comments\s+of\s+the\s+(?:Ministry|Government))",
    re.IGNORECASE | re.MULTILINE,
)
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
@dataclass
class AtrExtraction:
    """One recommendation/response pair split out of an Action-Taken Report."""

    recommendation_no: int      # Number captured from the recommendation marker
    recommendation_text: str    # Text between the marker and the reply marker
    response_text: str          # Text after the reply marker ("" if none found)
    confidence: float           # Heuristic score assigned by the extractor
    extractor: str = EXTRACTOR_VERSION

    def to_record(self) -> dict:
        """Serialise to a plain dict of kind ``"atr_response"``."""
        record = {"kind": "atr_response"}
        record["recommendation_no"] = self.recommendation_no
        record["recommendation_text"] = self.recommendation_text
        record["response_text"] = self.response_text
        record["confidence"] = self.confidence
        record["extractor"] = self.extractor
        return record
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def split_atr(text: str) -> list[AtrExtraction]:
    """Split an ATR PDF's text into recommendation/response records.

    The cleaned text is split on ``_ATR_REC_RE``. Each resulting body is
    then divided at the first ``_ATR_REPLY_RE`` match: what precedes it
    is the recommendation, what follows is the government's response.

    Confidence is 0.9 when both parts are non-empty, 0.5 when a reply
    marker was found but one part is empty, and 0.4 when no reply marker
    was found (the whole body becomes the recommendation). Bodies whose
    recommendation text is empty are dropped.

    Returns:
        The extracted records in document order; an empty list when the
        text is empty or contains no recommendation marker.
    """
    cleaned = _clean(text)
    if not cleaned:
        return []
    pieces = _ATR_REC_RE.split(cleaned)
    # split() with N capture groups yields
    # [pre, g1..gN, body1, g1..gN, body2, ...]; one record spans N + 1 items.
    group_count = _ATR_REC_RE.groups
    step = group_count + 1
    if len(pieces) <= step:
        return []

    results: list[AtrExtraction] = []
    for start in range(1, len(pieces) - step + 1, step):
        # Only one alternative of the marker regex participates per match,
        # so take whichever group actually captured the number.
        captured = pieces[start:start + group_count]
        number_text = next((grp for grp in captured if grp), None)
        try:
            rec_no = int(number_text) if number_text else None
        except (ValueError, TypeError):
            rec_no = None
        if rec_no is None:
            continue

        body = pieces[start + group_count] or ""
        reply_hit = _ATR_REPLY_RE.search(body)
        if reply_hit is None:
            # No reply marker: the whole body is the recommendation.
            rec_text, resp_text, confidence = body.strip(), "", 0.4
        else:
            rec_text = body[: reply_hit.start()].strip()
            resp_text = body[reply_hit.end():].strip()
            confidence = 0.9 if (rec_text and resp_text) else 0.5
        if not rec_text:
            continue
        results.append(AtrExtraction(
            recommendation_no=rec_no,
            recommendation_text=rec_text,
            response_text=resp_text,
            confidence=confidence,
        ))
    return results
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
# -------------------------------------------------------------------------
|
|
367
|
+
# DFG / original committee report extractor
|
|
368
|
+
# -------------------------------------------------------------------------
|
|
369
|
+
|
|
370
|
+
# Header of the recommendations section. PDFs vary between
# "OBSERVATIONS/RECOMMENDATIONS OF THE COMMITTEE",
# "OBSERVATIONS / RECOMMENDATIONS", etc., so the whitespace around the
# slash and the trailing "OF THE COMMITTEE" are both optional.
_DFG_SECTION_RE = re.compile(
    r"OBSERVATIONS\s*/\s*RECOMMENDATIONS(?:\s+OF\s+THE\s+COMMITTEE)?",
    flags=re.IGNORECASE,
)

# Numbered paragraph marker: digits, a period, then whitespace, at the
# start of a line (leading whitespace allowed). The line anchor matters:
# unanchored, the pattern would also fire on numbers inside body text.
# The paragraph number is captured.
_DFG_PARA_RE = re.compile(r"(?:^|\n)\s*(\d+)\.\s+", flags=re.MULTILINE)
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
@dataclass
class DfgExtraction:
    """One numbered recommendation paragraph from a committee report.

    There is no response field: these reports carry the committee's
    recommendations only.
    """

    recommendation_no: int      # Paragraph number captured from the marker
    recommendation_text: str    # Cleaned paragraph text
    confidence: float           # Heuristic score assigned by the extractor
    extractor: str = EXTRACTOR_VERSION

    def to_record(self) -> dict:
        """Serialise to a plain dict of kind ``"dfg_recommendation"``."""
        record = {"kind": "dfg_recommendation"}
        record["recommendation_no"] = self.recommendation_no
        record["recommendation_text"] = self.recommendation_text
        record["confidence"] = self.confidence
        record["extractor"] = self.extractor
        return record
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def split_dfg(text: str) -> list[DfgExtraction]:
    """Locate the recommendations section and split it into numbered paragraphs.

    The section starts after the *last* ``_DFG_SECTION_RE`` match (an
    earlier occurrence is typically the table-of-contents entry) and
    ends at the next line beginning with ANNEXURE, MINUTES OF THE,
    APPENDIX or ADDENDUM, or at the end of the document. The section is
    split on ``_DFG_PARA_RE`` and each paragraph body is passed through
    ``_clean``; empty bodies are dropped. Confidence is 0.8 for bodies
    longer than 80 characters and 0.5 otherwise.

    Returns:
        The extracted paragraphs in document order; an empty list when
        the text is empty, the section header is missing, or no numbered
        paragraph is found.
    """
    if not text:
        return []

    # Walk every header match and keep only the final one.
    last_header = None
    for last_header in _DFG_SECTION_RE.finditer(text):
        pass
    if last_header is None:
        return []
    section = text[last_header.end():]

    terminator = re.search(
        r"\n\s*(?:ANNEXURE|MINUTES\s+OF\s+THE|APPENDIX|ADDENDUM)\b",
        section,
        re.IGNORECASE,
    )
    if terminator:
        section = section[: terminator.start()]

    # One capture group, so split() yields [pre, no1, body1, no2, body2, ...].
    pieces = _DFG_PARA_RE.split(section)
    if len(pieces) < 3:
        return []

    results: list[DfgExtraction] = []
    for number_text, raw_body in zip(pieces[1::2], pieces[2::2]):
        try:
            rec_no = int(number_text)
        except (ValueError, TypeError):
            continue
        body = _clean(raw_body or "")
        if not body:
            continue
        results.append(DfgExtraction(
            recommendation_no=rec_no,
            recommendation_text=body,
            confidence=0.8 if len(body) > 80 else 0.5,
        ))
    return results
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
# -------------------------------------------------------------------------
|
|
451
|
+
# Corpus dispatcher
|
|
452
|
+
# -------------------------------------------------------------------------
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
@dataclass
class ExtractionStats:
    """Counters collected during one extraction run over the manifest."""

    # Output records written, by extractor.
    qa_records: int = 0
    atr_records: int = 0
    dfg_records: int = 0
    # Sources that produced no output, by reason.
    skipped_no_pdf: int = 0
    skipped_no_text: int = 0
    skipped_no_split: int = 0
    # Manifest records that were dispatched to an extractor at all.
    sources_processed: int = 0
    # One dict per failure; a fresh list per instance.
    errors: list[dict] = field(default_factory=list)
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
def _classify_source(rec: dict) -> str:
    """Pick the extractor for a manifest record.

    Returns ``'qa'`` | ``'atr'`` | ``'observations'`` | ``'skip'``:

    * ``kind == "qa"`` gives ``'qa'``.
    * ``kind == "committee_report"`` with ``report_type ==
      "action_taken"`` gives ``'atr'``.
    * any other ``committee_report`` (demands_for_grants, bill, subject,
      other / legacy "original", or a missing report type) gives
      ``'observations'``. These share the numbered
      Observations/Recommendations layout and therefore one extractor;
      the manifest's ``report_type`` is carried onto each output record
      as ``source_report_type`` so consumers can still tell them apart.
    * everything else gives ``'skip'``.

    Prior to v0.6.3 every non-ATR report was returned as ``'dfg'``,
    which mislabelled subject and bill reports.
    """
    source_kind = rec.get("kind") or ""
    if source_kind == "qa":
        return "qa"
    if source_kind != "committee_report":
        return "skip"
    is_action_taken = (rec.get("report_type") or "") == "action_taken"
    return "atr" if is_action_taken else "observations"
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
def _pdf_for_record(rec: dict, out_dir: Path) -> Path | None:
|
|
499
|
+
rel = rec.get("pdf_path")
|
|
500
|
+
if not rel:
|
|
501
|
+
return None
|
|
502
|
+
p = out_dir / rel
|
|
503
|
+
return p if p.exists() and p.stat().st_size > 1000 else None
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
def extract_answers(
    out_dir: Path, *, refresh: bool = False, log_fn=print
) -> ExtractionStats:
    """Run the matching extractor over every manifest record and write
    ``answers.jsonl``.

    Reads ``out_dir / "manifest.jsonl"``, classifies each record,
    extracts text from its PDF, and collects the resulting records. The
    output replaces ``out_dir / "answers.jsonl"`` via a sibling ``.tmp``
    file that is renamed over it. The manifest and the PDFs are not
    modified.

    Args:
        out_dir: Directory holding the manifest and the PDFs; created if
            missing.
        refresh: Accepted for interface compatibility; not read by this
            function, which always regenerates the output.
        log_fn: Callable given the one-line summary at the end.

    Returns:
        Counters and per-record errors for the run. Extraction and
        parsing failures are recorded in ``stats.errors`` rather than
        raised.
    """
    stats = ExtractionStats()
    manifest_path = out_dir / "manifest.jsonl"
    out_path = out_dir / "answers.jsonl"
    records = read_jsonl(manifest_path)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Which stats counter each extractor kind feeds.
    counter_for_kind = {
        "qa": "qa_records",
        "atr": "atr_records",
        "observations": "dfg_records",
    }

    rows: list[dict] = []
    for rec in records:
        kind = _classify_source(rec)
        if kind == "skip":
            continue
        stats.sources_processed += 1

        pdf = _pdf_for_record(rec, out_dir)
        if not pdf:
            stats.skipped_no_pdf += 1
            continue

        try:
            text = extract_pdf_text(pdf)
        except Exception as exc:  # noqa: BLE001
            stats.errors.append({"key": rec.get("key"), "where": "pdftotext", "error": repr(exc)})
            continue
        if not text or not text.strip():
            stats.skipped_no_text += 1
            continue

        # Fields shared by every record that comes out of this source.
        common = {
            "key": rec.get("key"),
            "run_id": rec.get("run_id"),
            "source_pdf": str(pdf.relative_to(out_dir)),
            "extracted_at": _now(),
            "language_classified": ["en"],
            # The manifest's report_type travels with each record so that
            # numbered observations from Demands-for-Grants, Bill and
            # Subject reports stay distinguishable downstream.
            "source_report_type": rec.get("report_type"),
        }

        try:
            if kind == "qa":
                qa = split_qa(text)
                items = [] if qa is None else [qa]
            elif kind == "atr":
                items = split_atr(text)
            elif kind == "observations":
                items = split_dfg(text)
            else:
                continue
            if not items:
                stats.skipped_no_split += 1
                continue
            for item in items:
                rows.append({**common, **item.to_record()})
            counter = counter_for_kind[kind]
            setattr(stats, counter, getattr(stats, counter) + len(items))
        except Exception as exc:  # noqa: BLE001
            stats.errors.append({"key": rec.get("key"), "where": kind, "error": repr(exc)})

    # Write to a sibling temp file, then rename over the target. The temp
    # name is built with ``with_name`` by appending to the full file name.
    tmp = out_path.with_name(out_path.name + ".tmp")
    with tmp.open("w", encoding="utf-8") as handle:
        for row in rows:
            handle.write(json.dumps(row, ensure_ascii=False) + "\n")
    tmp.replace(out_path)

    summary = (
        f"answers.jsonl: qa={stats.qa_records} atr={stats.atr_records} "
        f"dfg={stats.dfg_records} skipped_no_pdf={stats.skipped_no_pdf} "
        f"skipped_no_text={stats.skipped_no_text} skipped_no_split={stats.skipped_no_split} "
        f"errors={len(stats.errors)}"
    )
    log_fn(summary)
    return stats
|