esg-cid-plus 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,244 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject the date and other information into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ !.env.example
141
+ .venv
142
+ env/
143
+ venv/
144
+ ENV/
145
+ env.bak/
146
+ venv.bak/
147
+
148
+ # Spyder project settings
149
+ .spyderproject
150
+ .spyproject
151
+
152
+ # Rope project settings
153
+ .ropeproject
154
+
155
+ # mkdocs documentation
156
+ /site
157
+
158
+ # mypy
159
+ .mypy_cache/
160
+ .dmypy.json
161
+ dmypy.json
162
+
163
+ # Pyre type checker
164
+ .pyre/
165
+
166
+ # pytype static type analyzer
167
+ .pytype/
168
+
169
+ # Cython debug symbols
170
+ cython_debug/
171
+
172
+ # PyCharm
173
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
174
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
175
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
176
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
177
+ #.idea/
178
+
179
+ # Abstra
180
+ # Abstra is an AI-powered process automation framework.
181
+ # Ignore directories containing user credentials, local state, and settings.
182
+ # Learn more at https://abstra.io/docs
183
+ .abstra/
184
+
185
+ # Visual Studio Code
186
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
187
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
188
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
189
+ # you could uncomment the following to ignore the entire vscode folder
190
+ # .vscode/
191
+
192
+ # Ruff stuff:
193
+ .ruff_cache/
194
+
195
+ # PyPI configuration file
196
+ .pypirc
197
+
198
+ # Cursor
199
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
200
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data.
201
+ # Refer to https://docs.cursor.com/context/ignore-files
202
+ .cursorignore
203
+ .cursorindexingignore
204
+
205
+ # Marimo
206
+ marimo/_static/
207
+ marimo/_lsp/
208
+ __marimo__/
209
+
210
+ # ACM Paper
211
+ acm_paper/
212
+
213
+ .vscode/
214
+ .claude/
215
+ .codex/
216
+
217
+ .awd/
218
+
219
+ data/
220
+ !libs/**/data/
221
+
222
+ archive/
223
+
224
+ output/
225
+ outputs/
226
+ optimized/
227
+
228
+ **.aux
229
+ **.out
230
+ **.bbl
231
+ **.blg
232
+ **.fdb_latexmk
233
+ **.fls
234
+ neurips_paper/
235
+
236
+ # mlflow
237
+ mlflow*.db
238
+ mlartifacts*/
239
+ mlruns/
240
+
241
+ .superpowers/
242
+
243
+ # private skills (must not leak into eval / released dataset)
244
+ .agents/skills/annotate-gri-hard/
@@ -0,0 +1,79 @@
1
+ Metadata-Version: 2.4
2
+ Name: esg-cid-plus
3
+ Version: 0.1.0
4
+ Summary: ESG-CID-Plus: A Stratified Benchmark for Disclosure Content Index Table Extraction from Corporate Sustainability Reports
5
+ Project-URL: Homepage, https://github.com/anomalyco/esg-cid-plus
6
+ Author-email: Rehan Ahmed <shafiuddin.r.ahmed@accenture.com>
7
+ License: MIT
8
+ Keywords: benchmark,content-index,esg,pdf-extraction,sustainability
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Requires-Python: >=3.13
17
+ Requires-Dist: camelot-py>=0.11.0
18
+ Requires-Dist: datasets>=2.0
19
+ Requires-Dist: img2table>=2.0
20
+ Requires-Dist: lunr>=0.7.0
21
+ Requires-Dist: opencv-python-headless>=4.8
22
+ Requires-Dist: pdfminer-six>=20231228
23
+ Requires-Dist: pdfplumber>=0.11.9
24
+ Requires-Dist: pillow>=10.0.0
25
+ Requires-Dist: pydantic>=2.0
26
+ Requires-Dist: pypdfium2>=4.0.0
27
+ Requires-Dist: python-dotenv>=1.0
28
+ Requires-Dist: rapidocr-onnxruntime>=1.4
29
+ Description-Content-Type: text/markdown
30
+
31
+ # esg-cid-plus
32
+
33
+ Supporting code for the EMNLP 2026 paper:
34
+ **ESG-CID-Plus: A Stratified Benchmark for Disclosure Content Index Table Extraction from Corporate Sustainability Reports**
35
+
36
+ ## Install
37
+
38
+ ```bash
39
+ uv pip install -e .
40
+ ```
41
+
42
+ ## Modules
43
+
44
+ | Module | Purpose |
45
+ |--------|---------|
46
+ | `cid` | Data models (`CIRow`, `ContentIndex`, `FrameworkSpec`) and normalization helpers |
47
+ | `data` | HuggingFace loaders — `load_split`, `load_framework`, `resolve_pdf_path` |
48
+ | `pdf_toolkit` | `PDFSession` — text extraction, lunr search, camelot tables, page labels |
49
+ | `pdf_toolkit_ocr` | `OCRPDFSession` — extends above with image rendering, rapidocr, img2table |
50
+ | `extract` | Rule-based pipeline: S1 detect → S2/S3 draft → S4 resolve → `ContentIndex` |
51
+ | `eval` | Tuple-level P/R/F1 — `evaluate_predictions`, `format_feedback` |
52
+
53
+ ## Quickstart
54
+
55
+ ```python
56
+ from esg_cid_plus.data import load_split, load_framework, resolve_pdf_path
57
+ from esg_cid_plus.extract import extract
58
+ from esg_cid_plus.eval import evaluate_predictions
59
+
60
+ pdf_rows, cid_rows = load_split("train_small")
61
+ fw = load_framework("gri_2021")
62
+
63
+ pdf_path = resolve_pdf_path(pdf_rows[0])
64
+ ci = extract(pdf_path, fw)
65
+
66
+ gt = [r for r in cid_rows if r["report_name"] == pdf_rows[0]["report_name"]]
67
+ pred = [{"report_name": ci.report_name, "standard_id": r.standard_id,
68
+ "page_location_indices": r.page_location_indices} for r in ci.rows]
69
+
70
+ _, overall = evaluate_predictions(gt, pred)
71
+ print(overall)
72
+ ```
73
+
74
+ ## Tests
75
+
76
+ ```bash
77
+ uv run pytest -m "not slow" # fast (unit)
78
+ uv run pytest -m slow # integration — requires HuggingFace access
79
+ ```
@@ -0,0 +1,49 @@
1
+ # esg-cid-plus
2
+
3
+ Supporting code for the EMNLP 2026 paper:
4
+ **ESG-CID-Plus: A Stratified Benchmark for Disclosure Content Index Table Extraction from Corporate Sustainability Reports**
5
+
6
+ ## Install
7
+
8
+ ```bash
9
+ uv pip install -e .
10
+ ```
11
+
12
+ ## Modules
13
+
14
+ | Module | Purpose |
15
+ |--------|---------|
16
+ | `cid` | Data models (`CIRow`, `ContentIndex`, `FrameworkSpec`) and normalization helpers |
17
+ | `data` | HuggingFace loaders — `load_split`, `load_framework`, `resolve_pdf_path` |
18
+ | `pdf_toolkit` | `PDFSession` — text extraction, lunr search, camelot tables, page labels |
19
+ | `pdf_toolkit_ocr` | `OCRPDFSession` — extends above with image rendering, rapidocr, img2table |
20
+ | `extract` | Rule-based pipeline: S1 detect → S2/S3 draft → S4 resolve → `ContentIndex` |
21
+ | `eval` | Tuple-level P/R/F1 — `evaluate_predictions`, `format_feedback` |
22
+
23
+ ## Quickstart
24
+
25
+ ```python
26
+ from esg_cid_plus.data import load_split, load_framework, resolve_pdf_path
27
+ from esg_cid_plus.extract import extract
28
+ from esg_cid_plus.eval import evaluate_predictions
29
+
30
+ pdf_rows, cid_rows = load_split("train_small")
31
+ fw = load_framework("gri_2021")
32
+
33
+ pdf_path = resolve_pdf_path(pdf_rows[0])
34
+ ci = extract(pdf_path, fw)
35
+
36
+ gt = [r for r in cid_rows if r["report_name"] == pdf_rows[0]["report_name"]]
37
+ pred = [{"report_name": ci.report_name, "standard_id": r.standard_id,
38
+ "page_location_indices": r.page_location_indices} for r in ci.rows]
39
+
40
+ _, overall = evaluate_predictions(gt, pred)
41
+ print(overall)
42
+ ```
43
+
44
+ ## Tests
45
+
46
+ ```bash
47
+ uv run pytest -m "not slow" # fast (unit)
48
+ uv run pytest -m slow # integration — requires HuggingFace access
49
+ ```
@@ -0,0 +1,60 @@
1
+ [project]
2
+ name = "esg-cid-plus"
3
+ version = "0.1.0"
4
+ description = "ESG-CID-Plus: A Stratified Benchmark for Disclosure Content Index Table Extraction from Corporate Sustainability Reports"
5
+ readme = "README.md"
6
+ license = { text = "MIT" }
7
+ authors = [{ name = "Rehan Ahmed", email = "shafiuddin.r.ahmed@accenture.com" }]
8
+ keywords = ["esg", "content-index", "pdf-extraction", "benchmark", "sustainability"]
9
+ classifiers = [
10
+ "Development Status :: 4 - Beta",
11
+ "Intended Audience :: Science/Research",
12
+ "Intended Audience :: Developers",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Programming Language :: Python :: 3",
15
+ "Programming Language :: Python :: 3.13",
16
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
17
+ ]
18
+ requires-python = ">=3.13"
19
+ dependencies = [
20
+ "pdfplumber>=0.11.9",
21
+ "pdfminer.six>=20231228",
22
+ "camelot-py>=0.11.0",
23
+ "opencv-python-headless>=4.8",
24
+ "lunr>=0.7.0",
25
+ "pydantic>=2.0",
26
+ "datasets>=2.0",
27
+ "python-dotenv>=1.0",
28
+ "pypdfium2>=4.0.0",
29
+ "rapidocr-onnxruntime>=1.4",
30
+ "img2table>=2.0",
31
+ "Pillow>=10.0.0",
32
+ ]
33
+
34
+ [project.urls]
35
+ Homepage = "https://github.com/anomalyco/esg-cid-plus"
36
+
37
+ [dependency-groups]
38
+ dev = ["pytest", "ruff"]
39
+
40
+ [build-system]
41
+ requires = ["hatchling"]
42
+ build-backend = "hatchling.build"
43
+
44
+ [tool.hatch.build.targets.wheel]
45
+ packages = ["src/esg_cid_plus"]
46
+
47
+ [tool.ruff]
48
+ line-length = 100
49
+ target-version = "py313"
50
+ extend-exclude = [".awd"]
51
+
52
+ [tool.ruff.lint]
53
+ select = ["E", "F", "I", "W", "UP"]
54
+ ignore = ["E501"]
55
+
56
+ [tool.pytest.ini_options]
57
+ testpaths = ["tests"]
58
+ markers = [
59
+ "slow: integration tests that download real PDFs from HuggingFace (deselect with -m 'not slow')",
60
+ ]
File without changes
@@ -0,0 +1,189 @@
1
+ """CID data models and normalization helpers.
2
+
3
+ Defines the core types for Content Index Detection:
4
+ - CIRow / ContentIndex — pydantic models for one row / one full index
5
+ - FrameworkSpec / DisclosureDef — framework schema loaded from HuggingFace
6
+ - normalize_standard_id / parse_page_indices / row_page_set
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ import re
13
+ from dataclasses import dataclass, field
14
+ from pathlib import Path
15
+
16
+ from pydantic import BaseModel, Field, field_validator
17
+
18
+ # ── Normalization helpers ─────────────────────────────────────────────────────
19
+
20
# A hyphen, en dash, or em dash together with any whitespace around it.
_STANDARD_ID_DASH_RE = re.compile(r"\s*[-–—]\s*")

# "<digits> to <digits>" page ranges (captures both numbers).
_PAGE_RANGE_TO_RE = re.compile(r"(\d+)\s+to\s+(\d+)")

# Lower-cased page-location tokens that stand for "no page" and are skipped.
_PAGE_SENTINELS = {
    "",
    "-",
    "n/a",
    "na",
    "not applicable",
    "notapplicable",
}
23
+
24
+
25
def normalize_standard_id(sid: str) -> str:
    """Return *sid* without its framework prefix and with dashes normalized.

    At most one leading ``"GRI "``, ``"ESRS "`` or ``"SASB "`` prefix is
    removed (the match is case-sensitive). Afterwards every hyphen, en dash,
    or em dash, together with the whitespace around it, becomes a plain ``-``.

    >>> normalize_standard_id("GRI 2-1")
    '2-1'
    >>> normalize_standard_id("102 - 1")
    '102-1'
    """
    bare = sid.strip()
    # The first prefix that matches wins; an empty string means "no prefix".
    known_prefix = next(
        (p for p in ("GRI ", "ESRS ", "SASB ") if bare.startswith(p)),
        "",
    )
    bare = bare[len(known_prefix):].strip()
    return _STANDARD_ID_DASH_RE.sub("-", bare)
39
+
40
+
41
def parse_page_indices(v: str | list[int] | None) -> list[int]:
    """Parse 0-based page indices from a string or list, expanding ranges.

    Args:
        v: A list of indices (returned as a shallow copy), a comma-separated
            string of numbers and ranges (``"A-B"`` or ``"A to B"``), or
            ``None``.

    Returns:
        The page indices in input order. ``None``, empty strings, sentinel
        tokens such as ``"n/a"``, and tokens that are not integers contribute
        nothing. A descending range (``"5-3"``) is empty.

    >>> parse_page_indices("133-137")
    [133, 134, 135, 136, 137]
    >>> parse_page_indices("2, 4, 16-23")
    [2, 4, 16, 17, 18, 19, 20, 21, 22, 23]
    """
    if v is None:
        # A missing value means "no pages", the same way CIRow coerces a
        # missing page_location_text to "" instead of raising.
        return []
    if isinstance(v, list):
        return list(v)
    # Rewrite "A to B" as "A-B" so both spellings share the range branch below.
    text = _PAGE_RANGE_TO_RE.sub(r"\1-\2", v.strip())
    pages: list[int] = []
    for token in text.split(","):
        token = token.strip()
        if not token or token.lower() in _PAGE_SENTINELS:
            continue
        try:
            # A leading "-" is a sign, not a range separator.
            if "-" in token and token[0] != "-":
                lo, hi = token.split("-", 1)
                pages.extend(range(int(lo), int(hi) + 1))
            else:
                pages.append(int(token))
        except ValueError:
            # Unparseable tokens are skipped rather than failing the whole value.
            continue
    return pages
69
+
70
+
71
def row_page_set(row: dict) -> set[int]:
    """Return the 0-based page indices of a CID row dict as a set.

    Reads ``row["page_location_indices"]`` and parses it with
    ``parse_page_indices``. A missing key and a key whose value is ``None``
    or otherwise empty both yield an empty set.
    """
    # `or ""` covers a key that is present but None; the .get() default alone
    # only covers a key that is absent.
    raw = row.get("page_location_indices") or ""
    return set(parse_page_indices(raw))
74
+
75
+
76
+ # ── Pydantic models ───────────────────────────────────────────────────────────
77
+
78
+
79
class CIRow(BaseModel):
    """A single Content Index entry: one disclosure and where the PDF covers it.

    ``standard_id`` is stored in normalized form. ``page_location_text`` keeps
    the location as text (trimmed; list input is comma-joined), while
    ``page_location_indices`` holds the resolved 0-based page indices.
    """

    standard_id: str
    disclosure_text: str = ""
    disclosure_text_extracted: str = ""
    page_location_text: str = ""
    page_location_indices: list[int] = Field(default_factory=list)

    @field_validator("standard_id")
    @classmethod
    def _normalize_sid(cls, v: str) -> str:
        """Normalize the ID via ``normalize_standard_id``."""
        return normalize_standard_id(v)

    @field_validator("page_location_indices", mode="before")
    @classmethod
    def _coerce_indices(cls, v: str | list[int]) -> list[int]:
        """Turn a page string or list into a list of ints before validation."""
        return parse_page_indices(v)

    @field_validator("page_location_text", mode="before")
    @classmethod
    def _coerce_text(cls, v) -> str:
        """Coerce None, lists, and other values to a trimmed string."""
        if isinstance(v, list):
            # Drop None and blank entries, then join what is left.
            kept = [str(item) for item in v if item is not None and str(item).strip()]
            return ", ".join(kept)
        return "" if v is None else str(v).strip()
110
+
111
+
112
class ContentIndex(BaseModel):
    """A full Content Index for one PDF — one CIRow per framework disclosure.

    ``ci_page_indices`` accepts either a list of ints or a page string, which
    is parsed with ``parse_page_indices`` before validation.
    """

    report_name: str
    framework: str
    num_disclosures: int = Field(ge=0)
    ci_page_labels: str = Field(default="")
    ci_page_indices: list[int] = Field(default_factory=list)
    rows: list[CIRow]

    @field_validator("ci_page_indices", mode="before")
    @classmethod
    def _coerce_ci_indices(cls, v: str | list[int]) -> list[int]:
        """Turn a page string or list into a list of ints before validation."""
        return parse_page_indices(v)

    @classmethod
    def from_json_file(cls, path: str | Path) -> ContentIndex:
        """Load and validate a ContentIndex from the JSON file at *path*.

        The file is read as UTF-8 regardless of the platform's locale
        encoding, so non-ASCII text loads the same way everywhere.
        """
        with open(path, encoding="utf-8") as f:
            return cls.model_validate(json.load(f))

    def to_json_file(self, path: str | Path) -> None:
        """Write this index to *path* as indented JSON, creating parent dirs."""
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, "w", encoding="utf-8") as f:
            json.dump(self.model_dump(), f, indent=2)

    def to_cid_rows(self) -> list[dict]:
        """Convert to a list of dicts matching the HuggingFace ``cid`` config schema."""
        return [
            {
                "report_name": self.report_name,
                "framework": self.framework,
                "standard_id": r.standard_id,
                "disclosure_text": r.disclosure_text,
                # The row's verbatim location text is exported under the
                # "labels" key.
                "page_location_labels": r.page_location_text,
                "page_location_indices": r.page_location_indices,
            }
            for r in self.rows
        ]
150
+
151
+
152
+ # ── Framework spec ────────────────────────────────────────────────────────────
153
+
154
+
155
@dataclass(frozen=True)
class DisclosureDef:
    """Immutable record for one disclosure of the official framework template."""

    # Identifier of the disclosure.
    standard_id: str
    # Title of the disclosure.
    title: str
    # Parent standard; defaults to an empty string when not given.
    parent_standard: str = ""
162
+
163
+
164
@dataclass
class FrameworkSpec:
    """Specification of one reporting framework (loaded from HuggingFace).

    ``group_pattern`` is an optional compiled regex meant to match any
    standard_id of this framework; the page detector and drafter use it.
    ``disclosures`` maps a standard_id to its ``DisclosureDef``.
    """

    id: str
    display_name: str
    group_pattern: re.Pattern[str] | None = None
    disclosures: dict[str, DisclosureDef] = field(default_factory=dict)

    def has_disclosure(self, standard_id: str) -> bool:
        """Return True when *standard_id* is a key of ``disclosures``."""
        return standard_id in self.disclosures

    def get_title(self, standard_id: str) -> str | None:
        """Return the title registered for *standard_id*, or None if absent."""
        entry = self.disclosures.get(standard_id)
        if not entry:
            return None
        return entry.title

    def detect(self, standard_ids: list[str]) -> bool:
        """Heuristic: do these IDs look like they belong to this framework?

        True only when strictly more than half of *standard_ids* are known
        disclosures; an empty list is never a match.
        """
        if not standard_ids:
            return False
        known = [sid for sid in standard_ids if sid in self.disclosures]
        return len(known) / len(standard_ids) > 0.5