esg-cid-plus 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- esg_cid_plus-0.1.0/.gitignore +244 -0
- esg_cid_plus-0.1.0/PKG-INFO +79 -0
- esg_cid_plus-0.1.0/README.md +49 -0
- esg_cid_plus-0.1.0/pyproject.toml +60 -0
- esg_cid_plus-0.1.0/src/esg_cid_plus/__init__.py +0 -0
- esg_cid_plus-0.1.0/src/esg_cid_plus/cid.py +189 -0
- esg_cid_plus-0.1.0/src/esg_cid_plus/data.py +250 -0
- esg_cid_plus-0.1.0/src/esg_cid_plus/eval.py +297 -0
- esg_cid_plus-0.1.0/src/esg_cid_plus/extract.py +380 -0
- esg_cid_plus-0.1.0/src/esg_cid_plus/pdf_toolkit.py +474 -0
- esg_cid_plus-0.1.0/src/esg_cid_plus/pdf_toolkit_ocr.py +306 -0
- esg_cid_plus-0.1.0/tests/conftest.py +76 -0
- esg_cid_plus-0.1.0/tests/test_cid.py +185 -0
- esg_cid_plus-0.1.0/tests/test_data.py +191 -0
- esg_cid_plus-0.1.0/tests/test_eval.py +247 -0
- esg_cid_plus-0.1.0/tests/test_extract.py +234 -0
- esg_cid_plus-0.1.0/tests/test_integration.py +270 -0
- esg_cid_plus-0.1.0/tests/test_pdf_toolkit.py +248 -0
- esg_cid_plus-0.1.0/tests/test_pdf_toolkit_ocr.py +162 -0
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
#Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
#poetry.lock
|
|
109
|
+
#poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
#pdm.lock
|
|
116
|
+
#pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
#pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# SageMath parsed files
|
|
135
|
+
*.sage.py
|
|
136
|
+
|
|
137
|
+
# Environments
|
|
138
|
+
.env
|
|
139
|
+
.envrc
|
|
140
|
+
!.env.example
|
|
141
|
+
.venv
|
|
142
|
+
env/
|
|
143
|
+
venv/
|
|
144
|
+
ENV/
|
|
145
|
+
env.bak/
|
|
146
|
+
venv.bak/
|
|
147
|
+
|
|
148
|
+
# Spyder project settings
|
|
149
|
+
.spyderproject
|
|
150
|
+
.spyproject
|
|
151
|
+
|
|
152
|
+
# Rope project settings
|
|
153
|
+
.ropeproject
|
|
154
|
+
|
|
155
|
+
# mkdocs documentation
|
|
156
|
+
/site
|
|
157
|
+
|
|
158
|
+
# mypy
|
|
159
|
+
.mypy_cache/
|
|
160
|
+
.dmypy.json
|
|
161
|
+
dmypy.json
|
|
162
|
+
|
|
163
|
+
# Pyre type checker
|
|
164
|
+
.pyre/
|
|
165
|
+
|
|
166
|
+
# pytype static type analyzer
|
|
167
|
+
.pytype/
|
|
168
|
+
|
|
169
|
+
# Cython debug symbols
|
|
170
|
+
cython_debug/
|
|
171
|
+
|
|
172
|
+
# PyCharm
|
|
173
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
174
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
175
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
176
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
177
|
+
#.idea/
|
|
178
|
+
|
|
179
|
+
# Abstra
|
|
180
|
+
# Abstra is an AI-powered process automation framework.
|
|
181
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
182
|
+
# Learn more at https://abstra.io/docs
|
|
183
|
+
.abstra/
|
|
184
|
+
|
|
185
|
+
# Visual Studio Code
|
|
186
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
187
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
188
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
189
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
190
|
+
# .vscode/
|
|
191
|
+
|
|
192
|
+
# Ruff stuff:
|
|
193
|
+
.ruff_cache/
|
|
194
|
+
|
|
195
|
+
# PyPI configuration file
|
|
196
|
+
.pypirc
|
|
197
|
+
|
|
198
|
+
# Cursor
|
|
199
|
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
|
200
|
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
|
201
|
+
# refer to https://docs.cursor.com/context/ignore-files
|
|
202
|
+
.cursorignore
|
|
203
|
+
.cursorindexingignore
|
|
204
|
+
|
|
205
|
+
# Marimo
|
|
206
|
+
marimo/_static/
|
|
207
|
+
marimo/_lsp/
|
|
208
|
+
__marimo__/
|
|
209
|
+
|
|
210
|
+
# ACM Paper
|
|
211
|
+
acm_paper/
|
|
212
|
+
|
|
213
|
+
.vscode/
|
|
214
|
+
.claude/
|
|
215
|
+
.codex/
|
|
216
|
+
|
|
217
|
+
.awd/
|
|
218
|
+
|
|
219
|
+
data/
|
|
220
|
+
!libs/**/data/
|
|
221
|
+
|
|
222
|
+
archive/
|
|
223
|
+
|
|
224
|
+
output/
|
|
225
|
+
outputs/
|
|
226
|
+
optimized/
|
|
227
|
+
|
|
228
|
+
**.aux
|
|
229
|
+
**.out
|
|
230
|
+
**.bbl
|
|
231
|
+
**.blg
|
|
232
|
+
**.fdb_latexmk
|
|
233
|
+
**.fls
|
|
234
|
+
neurips_paper/
|
|
235
|
+
|
|
236
|
+
# mlflow
|
|
237
|
+
mlflow*.db
|
|
238
|
+
mlartifacts*/
|
|
239
|
+
mlruns/
|
|
240
|
+
|
|
241
|
+
.superpowers/
|
|
242
|
+
|
|
243
|
+
# private skills (must not leak into eval / released dataset)
|
|
244
|
+
.agents/skills/annotate-gri-hard/
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: esg-cid-plus
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: ESG-CID-Plus: A Stratified Benchmark for Disclosure Content Index Table Extraction from Corporate Sustainability Reports
|
|
5
|
+
Project-URL: Homepage, https://github.com/anomalyco/esg-cid-plus
|
|
6
|
+
Author-email: Rehan Ahmed <shafiuddin.r.ahmed@accenture.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Keywords: benchmark,content-index,esg,pdf-extraction,sustainability
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Requires-Python: >=3.13
|
|
17
|
+
Requires-Dist: camelot-py>=0.11.0
|
|
18
|
+
Requires-Dist: datasets>=2.0
|
|
19
|
+
Requires-Dist: img2table>=2.0
|
|
20
|
+
Requires-Dist: lunr>=0.7.0
|
|
21
|
+
Requires-Dist: opencv-python-headless>=4.8
|
|
22
|
+
Requires-Dist: pdfminer-six>=20231228
|
|
23
|
+
Requires-Dist: pdfplumber>=0.11.9
|
|
24
|
+
Requires-Dist: pillow>=10.0.0
|
|
25
|
+
Requires-Dist: pydantic>=2.0
|
|
26
|
+
Requires-Dist: pypdfium2>=4.0.0
|
|
27
|
+
Requires-Dist: python-dotenv>=1.0
|
|
28
|
+
Requires-Dist: rapidocr-onnxruntime>=1.4
|
|
29
|
+
Description-Content-Type: text/markdown
|
|
30
|
+
|
|
31
|
+
# esg-cid-plus
|
|
32
|
+
|
|
33
|
+
Supporting code for the EMNLP 2026 paper:
|
|
34
|
+
**ESG-CID-Plus: A Stratified Benchmark for Disclosure Content Index Table Extraction from Corporate Sustainability Reports**
|
|
35
|
+
|
|
36
|
+
## Install
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
uv pip install -e .
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Modules
|
|
43
|
+
|
|
44
|
+
| Module | Purpose |
|
|
45
|
+
|--------|---------|
|
|
46
|
+
| `cid` | Data models (`CIRow`, `ContentIndex`, `FrameworkSpec`) and normalization helpers |
|
|
47
|
+
| `data` | HuggingFace loaders — `load_split`, `load_framework`, `resolve_pdf_path` |
|
|
48
|
+
| `pdf_toolkit` | `PDFSession` — text extraction, lunr search, camelot tables, page labels |
|
|
49
|
+
| `pdf_toolkit_ocr` | `OCRPDFSession` — extends above with image rendering, rapidocr, img2table |
|
|
50
|
+
| `extract` | Rule-based pipeline: S1 detect → S2/S3 draft → S4 resolve → `ContentIndex` |
|
|
51
|
+
| `eval` | Tuple-level P/R/F1 — `evaluate_predictions`, `format_feedback` |
|
|
52
|
+
|
|
53
|
+
## Quickstart
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from esg_cid_plus.data import load_split, load_framework, resolve_pdf_path
|
|
57
|
+
from esg_cid_plus.extract import extract
|
|
58
|
+
from esg_cid_plus.eval import evaluate_predictions
|
|
59
|
+
|
|
60
|
+
pdf_rows, cid_rows = load_split("train_small")
|
|
61
|
+
fw = load_framework("gri_2021")
|
|
62
|
+
|
|
63
|
+
pdf_path = resolve_pdf_path(pdf_rows[0])
|
|
64
|
+
ci = extract(pdf_path, fw)
|
|
65
|
+
|
|
66
|
+
gt = [r for r in cid_rows if r["report_name"] == pdf_rows[0]["report_name"]]
|
|
67
|
+
pred = [{"report_name": ci.report_name, "standard_id": r.standard_id,
|
|
68
|
+
"page_location_indices": r.page_location_indices} for r in ci.rows]
|
|
69
|
+
|
|
70
|
+
_, overall = evaluate_predictions(gt, pred)
|
|
71
|
+
print(overall)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Tests
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
uv run pytest # fast (unit)
|
|
78
|
+
uv run pytest -m slow # integration — requires HuggingFace access
|
|
79
|
+
```
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# esg-cid-plus
|
|
2
|
+
|
|
3
|
+
Supporting code for the EMNLP 2026 paper:
|
|
4
|
+
**ESG-CID-Plus: A Stratified Benchmark for Disclosure Content Index Table Extraction from Corporate Sustainability Reports**
|
|
5
|
+
|
|
6
|
+
## Install
|
|
7
|
+
|
|
8
|
+
```bash
|
|
9
|
+
uv pip install -e .
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## Modules
|
|
13
|
+
|
|
14
|
+
| Module | Purpose |
|
|
15
|
+
|--------|---------|
|
|
16
|
+
| `cid` | Data models (`CIRow`, `ContentIndex`, `FrameworkSpec`) and normalization helpers |
|
|
17
|
+
| `data` | HuggingFace loaders — `load_split`, `load_framework`, `resolve_pdf_path` |
|
|
18
|
+
| `pdf_toolkit` | `PDFSession` — text extraction, lunr search, camelot tables, page labels |
|
|
19
|
+
| `pdf_toolkit_ocr` | `OCRPDFSession` — extends above with image rendering, rapidocr, img2table |
|
|
20
|
+
| `extract` | Rule-based pipeline: S1 detect → S2/S3 draft → S4 resolve → `ContentIndex` |
|
|
21
|
+
| `eval` | Tuple-level P/R/F1 — `evaluate_predictions`, `format_feedback` |
|
|
22
|
+
|
|
23
|
+
## Quickstart
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
from esg_cid_plus.data import load_split, load_framework, resolve_pdf_path
|
|
27
|
+
from esg_cid_plus.extract import extract
|
|
28
|
+
from esg_cid_plus.eval import evaluate_predictions
|
|
29
|
+
|
|
30
|
+
pdf_rows, cid_rows = load_split("train_small")
|
|
31
|
+
fw = load_framework("gri_2021")
|
|
32
|
+
|
|
33
|
+
pdf_path = resolve_pdf_path(pdf_rows[0])
|
|
34
|
+
ci = extract(pdf_path, fw)
|
|
35
|
+
|
|
36
|
+
gt = [r for r in cid_rows if r["report_name"] == pdf_rows[0]["report_name"]]
|
|
37
|
+
pred = [{"report_name": ci.report_name, "standard_id": r.standard_id,
|
|
38
|
+
"page_location_indices": r.page_location_indices} for r in ci.rows]
|
|
39
|
+
|
|
40
|
+
_, overall = evaluate_predictions(gt, pred)
|
|
41
|
+
print(overall)
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Tests
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
uv run pytest # fast (unit)
|
|
48
|
+
uv run pytest -m slow # integration — requires HuggingFace access
|
|
49
|
+
```
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "esg-cid-plus"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "ESG-CID-Plus: A Stratified Benchmark for Disclosure Content Index Table Extraction from Corporate Sustainability Reports"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = { text = "MIT" }
|
|
7
|
+
authors = [{ name = "Rehan Ahmed", email = "shafiuddin.r.ahmed@accenture.com" }]
|
|
8
|
+
keywords = ["esg", "content-index", "pdf-extraction", "benchmark", "sustainability"]
|
|
9
|
+
classifiers = [
|
|
10
|
+
"Development Status :: 4 - Beta",
|
|
11
|
+
"Intended Audience :: Science/Research",
|
|
12
|
+
"Intended Audience :: Developers",
|
|
13
|
+
"License :: OSI Approved :: MIT License",
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"Programming Language :: Python :: 3.13",
|
|
16
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
17
|
+
]
|
|
18
|
+
requires-python = ">=3.13"
|
|
19
|
+
dependencies = [
|
|
20
|
+
"pdfplumber>=0.11.9",
|
|
21
|
+
"pdfminer.six>=20231228",
|
|
22
|
+
"camelot-py>=0.11.0",
|
|
23
|
+
"opencv-python-headless>=4.8",
|
|
24
|
+
"lunr>=0.7.0",
|
|
25
|
+
"pydantic>=2.0",
|
|
26
|
+
"datasets>=2.0",
|
|
27
|
+
"python-dotenv>=1.0",
|
|
28
|
+
"pypdfium2>=4.0.0",
|
|
29
|
+
"rapidocr-onnxruntime>=1.4",
|
|
30
|
+
"img2table>=2.0",
|
|
31
|
+
"Pillow>=10.0.0",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
Homepage = "https://github.com/anomalyco/esg-cid-plus"
|
|
36
|
+
|
|
37
|
+
[dependency-groups]
|
|
38
|
+
dev = ["pytest", "ruff"]
|
|
39
|
+
|
|
40
|
+
[build-system]
|
|
41
|
+
requires = ["hatchling"]
|
|
42
|
+
build-backend = "hatchling.build"
|
|
43
|
+
|
|
44
|
+
[tool.hatch.build.targets.wheel]
|
|
45
|
+
packages = ["src/esg_cid_plus"]
|
|
46
|
+
|
|
47
|
+
[tool.ruff]
|
|
48
|
+
line-length = 100
|
|
49
|
+
target-version = "py313"
|
|
50
|
+
extend-exclude = [".awd"]
|
|
51
|
+
|
|
52
|
+
[tool.ruff.lint]
|
|
53
|
+
select = ["E", "F", "I", "W", "UP"]
|
|
54
|
+
ignore = ["E501"]
|
|
55
|
+
|
|
56
|
+
[tool.pytest.ini_options]
|
|
57
|
+
testpaths = ["tests"]
|
|
58
|
+
markers = [
|
|
59
|
+
"slow: integration tests that download real PDFs from HuggingFace (deselect with -m 'not slow')",
|
|
60
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""CID data models and normalization helpers.
|
|
2
|
+
|
|
3
|
+
Defines the core types for Content Index Detection:
|
|
4
|
+
- CIRow / ContentIndex — pydantic models for one row / one full index
|
|
5
|
+
- FrameworkSpec / DisclosureDef — framework schema loaded from HuggingFace
|
|
6
|
+
- normalize_standard_id / parse_page_indices / row_page_set
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import re
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
from pydantic import BaseModel, Field, field_validator
|
|
17
|
+
|
|
18
|
+
# ── Normalization helpers ─────────────────────────────────────────────────────
|
|
19
|
+
|
|
20
|
+
_STANDARD_ID_DASH_RE = re.compile(r"\s*[-–—]\s*")
|
|
21
|
+
_PAGE_RANGE_TO_RE = re.compile(r"(\d+)\s+to\s+(\d+)")
|
|
22
|
+
_PAGE_SENTINELS = {"n/a", "na", "-", "notapplicable", "not applicable", ""}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def normalize_standard_id(sid: str) -> str:
|
|
26
|
+
"""Strip framework prefixes and normalize dashes.
|
|
27
|
+
|
|
28
|
+
>>> normalize_standard_id("GRI 2-1")
|
|
29
|
+
'2-1'
|
|
30
|
+
>>> normalize_standard_id("102 - 1")
|
|
31
|
+
'102-1'
|
|
32
|
+
"""
|
|
33
|
+
sid = sid.strip()
|
|
34
|
+
for prefix in ("GRI ", "ESRS ", "SASB "):
|
|
35
|
+
if sid.startswith(prefix):
|
|
36
|
+
sid = sid[len(prefix):]
|
|
37
|
+
break
|
|
38
|
+
return _STANDARD_ID_DASH_RE.sub("-", sid.strip())
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def parse_page_indices(v: str | list[int]) -> list[int]:
|
|
42
|
+
"""Parse 0-based page indices from a string or list, expanding ranges.
|
|
43
|
+
|
|
44
|
+
>>> parse_page_indices("133-137")
|
|
45
|
+
[133, 134, 135, 136, 137]
|
|
46
|
+
>>> parse_page_indices("2, 4, 16-23")
|
|
47
|
+
[2, 4, 16, 17, 18, 19, 20, 21, 22, 23]
|
|
48
|
+
"""
|
|
49
|
+
if isinstance(v, list):
|
|
50
|
+
return list(v)
|
|
51
|
+
v = _PAGE_RANGE_TO_RE.sub(r"\1-\2", v.strip())
|
|
52
|
+
result: list[int] = []
|
|
53
|
+
for part in v.split(","):
|
|
54
|
+
part = part.strip()
|
|
55
|
+
if not part or part.lower() in _PAGE_SENTINELS:
|
|
56
|
+
continue
|
|
57
|
+
if "-" in part and part[0] != "-":
|
|
58
|
+
try:
|
|
59
|
+
lo, hi = part.split("-", 1)
|
|
60
|
+
result.extend(range(int(lo), int(hi) + 1))
|
|
61
|
+
except ValueError:
|
|
62
|
+
continue
|
|
63
|
+
else:
|
|
64
|
+
try:
|
|
65
|
+
result.append(int(part))
|
|
66
|
+
except ValueError:
|
|
67
|
+
continue
|
|
68
|
+
return result
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def row_page_set(row: dict) -> set[int]:
    """Pull 0-based page indices from a CID row dict into a set.

    Reads ``row["page_location_indices"]`` and parses it with
    ``parse_page_indices``. A missing key or an explicit ``None`` value
    yields an empty set.
    """
    raw = row.get("page_location_indices")
    if raw is None:
        # dict.get's default only covers a missing key; a key that is present
        # with a None value would otherwise reach the parser and fail.
        return set()
    return set(parse_page_indices(raw))
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# ── Pydantic models ───────────────────────────────────────────────────────────
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class CIRow(BaseModel):
    """A single Content Index entry.

    Links one framework disclosure to the place it is reported in the PDF.
    ``page_location_text`` keeps the location as written; ``page_location_indices``
    holds the resolved 0-based page indices.
    """

    # Disclosure identifier; passed through normalize_standard_id on validation.
    standard_id: str
    disclosure_text: str = ""
    disclosure_text_extracted: str = ""
    # Location text; None and lists are coerced to a string before validation.
    page_location_text: str = ""
    # Accepts a string such as "2, 4, 16-23" or a list; parsed before validation.
    page_location_indices: list[int] = Field(default_factory=list)

    @field_validator("standard_id")
    @classmethod
    def _normalize_sid(cls, v: str) -> str:
        """Store the standard ID in its normalized form."""
        return normalize_standard_id(v)

    @field_validator("page_location_indices", mode="before")
    @classmethod
    def _coerce_indices(cls, v: str | list[int]) -> list[int]:
        """Turn a raw string or list into a list of page indices."""
        return parse_page_indices(v)

    @field_validator("page_location_text", mode="before")
    @classmethod
    def _coerce_text(cls, v) -> str:
        """Flatten None / list / scalar input into one stripped string."""
        if v is None:
            return ""
        if not isinstance(v, list):
            return str(v).strip()
        # Drop None entries and entries that are blank once stringified.
        kept = [str(item) for item in v if item is not None and str(item).strip()]
        return ", ".join(kept)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class ContentIndex(BaseModel):
    """A full Content Index for one PDF — one CIRow per framework disclosure."""

    report_name: str
    framework: str
    # Validated to be non-negative (ge=0).
    num_disclosures: int = Field(ge=0)
    ci_page_labels: str = Field(default="")
    # Accepts a string such as "133-137" or a list; parsed before validation.
    ci_page_indices: list[int] = Field(default_factory=list)
    rows: list[CIRow]

    @field_validator("ci_page_indices", mode="before")
    @classmethod
    def _coerce_ci_indices(cls, v: str | list[int]) -> list[int]:
        """Turn a raw string or list into a list of page indices."""
        return parse_page_indices(v)

    @classmethod
    def from_json_file(cls, path: str | Path) -> ContentIndex:
        """Load and validate a ContentIndex from a JSON file.

        The file is read as UTF-8 regardless of the platform's locale
        encoding, so the result does not depend on the host configuration.
        """
        with open(Path(path), encoding="utf-8") as f:
            return cls.model_validate(json.load(f))

    def to_json_file(self, path: str | Path) -> None:
        """Write this index to ``path`` as indented JSON (UTF-8).

        Missing parent directories are created first.
        """
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, "w", encoding="utf-8") as f:
            json.dump(self.model_dump(), f, indent=2)

    def to_cid_rows(self) -> list[dict]:
        """Convert to a list of dicts matching the HuggingFace ``cid`` config schema."""
        return [
            {
                "report_name": self.report_name,
                "framework": self.framework,
                "standard_id": r.standard_id,
                "disclosure_text": r.disclosure_text,
                # The row's location text is exported under the "labels" key.
                "page_location_labels": r.page_location_text,
                "page_location_indices": r.page_location_indices,
            }
            for r in self.rows
        ]
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# ── Framework spec ────────────────────────────────────────────────────────────
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
@dataclass(frozen=True)
class DisclosureDef:
    """One disclosure entry as defined by the official framework template.

    Instances are immutable (frozen dataclass).
    """

    # Identifier of the disclosure within its framework.
    standard_id: str
    # Title of the disclosure.
    title: str
    # Parent standard; empty string when not provided.
    parent_standard: str = ""
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
@dataclass
class FrameworkSpec:
    """Specification of a reporting framework (loaded from HuggingFace).

    ``group_pattern`` is a compiled regex matching any standard_id that
    belongs to the framework; the page detector and drafter rely on it.
    ``disclosures`` maps a standard_id to its ``DisclosureDef``.
    """

    id: str
    display_name: str
    group_pattern: re.Pattern[str] | None = None
    disclosures: dict[str, DisclosureDef] = field(default_factory=dict)

    def has_disclosure(self, standard_id: str) -> bool:
        """Return True when ``standard_id`` is a key of ``disclosures``."""
        known = self.disclosures
        return standard_id in known

    def get_title(self, standard_id: str) -> str | None:
        """Return the title registered for ``standard_id``, or None if absent."""
        entry = self.disclosures.get(standard_id)
        if not entry:
            return None
        return entry.title

    def detect(self, standard_ids: list[str]) -> bool:
        """Heuristic: do these IDs look like they belong to this framework?

        True when strictly more than half of ``standard_ids`` are known
        disclosures; an empty list is never a match.
        """
        if not standard_ids:
            return False
        hits = 0
        for candidate in standard_ids:
            if candidate in self.disclosures:
                hits += 1
        return hits / len(standard_ids) > 0.5
|