fina-extractor-lib 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extract_lib/__init__.py +18 -0
- extract_lib/base.py +44 -0
- extract_lib/exceptions.py +2 -0
- extract_lib/normalize/__init__.py +3 -0
- extract_lib/normalize/currency.py +7 -0
- extract_lib/normalize/dates.py +17 -0
- extract_lib/normalize/number.py +56 -0
- extract_lib/normalize/period.py +14 -0
- extract_lib/normalize/scale.py +13 -0
- extract_lib/normalize/sign.py +7 -0
- extract_lib/readers/__init__.py +3 -0
- extract_lib/readers/docx_reader.py +18 -0
- extract_lib/readers/html_reader.py +18 -0
- extract_lib/readers/ixbrl.py +25 -0
- extract_lib/readers/ocr.py +3 -0
- extract_lib/readers/pdf_tables.py +30 -0
- extract_lib/readers/pdf_text.py +14 -0
- extract_lib/readers/xlsx_reader.py +17 -0
- extract_lib/register.py +3 -0
- extract_lib/registry.py +63 -0
- extract_lib/resolver.py +85 -0
- extract_lib/structure/__init__.py +3 -0
- extract_lib/structure/cell_cluster.py +17 -0
- extract_lib/structure/header_detect.py +17 -0
- extract_lib/structure/table_model.py +11 -0
- extract_lib/synonyms.py +43 -0
- fina_extractor_lib/__init__.py +1 -0
- fina_extractor_lib-0.1.0.dist-info/METADATA +17 -0
- fina_extractor_lib-0.1.0.dist-info/RECORD +31 -0
- fina_extractor_lib-0.1.0.dist-info/WHEEL +5 -0
- fina_extractor_lib-0.1.0.dist-info/top_level.txt +2 -0
extract_lib/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from .base import ExtractResult, RUNTIME_EXTRACTOR_REGISTRY, build_extract, extractor
|
|
2
|
+
from .registry import DOMAINS, all_extractors, count_summary
|
|
3
|
+
from . import resolver
|
|
4
|
+
from .normalize import number, scale, sign
|
|
5
|
+
from .readers import ixbrl, pdf_tables
|
|
6
|
+
|
|
7
|
+
# Short alias for the runtime registry imported from ``.base``.
EXTRACTOR_REGISTRY = RUNTIME_EXTRACTOR_REGISTRY

# Names exported by ``from extract_lib import *``.
__all__ = [
    "DOMAINS", "all_extractors", "count_summary",
    "ExtractResult", "EXTRACTOR_REGISTRY", "extractor", "build_extract",
    "resolver",
]
|
extract_lib/base.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Any, Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class ExtractResult:
    """Result record for one extracted metric.

    ``value`` and ``metric_id`` are required and come first positionally;
    every other field has a default.
    """

    # --- required ---
    value: Any
    metric_id: str

    # --- value details ---
    raw_value: str = ""
    unit: str = ""
    scale: float = 1.0
    period: str = ""

    # --- provenance ---
    page: Optional[int] = None
    table_number: Optional[int] = None
    row_header: str = ""
    col_header: str = ""
    source: str = ""

    # --- quality / status ---
    confidence: float = 0.0
    valid: bool = True
    error: Optional[str] = None

    # Fresh list per instance (default_factory), so results never share candidates.
    candidates: List[Dict[str, Any]] = field(default_factory=list)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Maps extractor id -> {"fn", "name", "kind", "description", "inputs"}.
RUNTIME_EXTRACTOR_REGISTRY: Dict[str, Dict[str, Any]] = {}


def _positional_inputs(fn) -> List[str]:
    """Return the names of *fn*'s positional parameters, or ``[]`` if unknown."""
    code = getattr(fn, "__code__", None)
    if code is not None:
        # Plain Python functions: read the names straight off the code object.
        return list(code.co_varnames[: code.co_argcount])

    # Callables without a code object (functools.partial, builtins, callable
    # instances) used to raise AttributeError here; introspect them instead.
    import inspect

    try:
        params = inspect.signature(fn).parameters.values()
    except (TypeError, ValueError):
        # Some builtins expose no signature at all.
        return []
    positional = (
        inspect.Parameter.POSITIONAL_ONLY,
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
    )
    return [p.name for p in params if p.kind in positional]


def extractor(eid: str, name: str, kind: str, description: str = ""):
    """Return a decorator that registers a callable under *eid*.

    The decorator stores the callable together with *name*, *kind*,
    *description* and the list of its positional parameter names in
    ``RUNTIME_EXTRACTOR_REGISTRY[eid]`` and returns the callable unchanged.
    Registering the same *eid* twice replaces the earlier entry.
    """

    def _wrap(fn):
        RUNTIME_EXTRACTOR_REGISTRY[eid] = {
            "fn": fn,
            "name": name,
            "kind": kind,
            "description": description,
            "inputs": _positional_inputs(fn),
        }
        return fn

    return _wrap
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def build_extract(metric_id: str, value: Any, **kw: Any) -> ExtractResult:
    """Build an ``ExtractResult`` for *metric_id* carrying *value*.

    Any extra keyword arguments are passed through as ``ExtractResult`` fields.
    """
    fields = dict(kw, metric_id=metric_id, value=value)
    return ExtractResult(**fields)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def norm_date(s: str):
    """Convert a date string to ISO ``YYYY-MM-DD``; return ``None`` if no format fits.

    Formats are tried in the order listed, so an ambiguous numeric date such
    as ``01/02/2024`` is read month-first.
    """
    candidate = str(s).strip()
    layouts = ("%Y-%m-%d", "%d-%m-%Y", "%m/%d/%Y", "%d/%m/%Y", "%b %d, %Y", "%B %d, %Y")
    for layout in layouts:
        try:
            parsed = datetime.strptime(candidate, layout)
        except ValueError:
            continue
        return parsed.date().isoformat()
    return None
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# Footnote markers and captions removed from a cell before parsing.
_NOISE_PATTERNS = [
    r"\bnote\s*\d+\b",
    r"\bsee\s+note\b",
    r"\bunaudited\b",
]

# First plain numeric token: optional sign, digits with comma grouping,
# optional decimal part.
_NUMBER_TOKEN_RE = re.compile(r"[-+]?\d[\d,]*(?:\.\d+)?")

# Continental notation such as "1.234,56": dot-grouped thousands followed by a
# decimal comma. Strict 3-digit groups keep inputs like "1.5,2" out of this path.
_EURO_NUMBER_RE = re.compile(r"[-+]?\d{1,3}(?:\.\d{3})+,\d+(?![\d.,])")


def _strip_noise(text: str) -> str:
    """Remove every ``_NOISE_PATTERNS`` match (case-insensitive) and trim."""
    out = text
    for pat in _NOISE_PATTERNS:
        out = re.sub(pat, "", out, flags=re.IGNORECASE)
    return out.strip()


def norm_number(s):
    """Parse a financial number safely, return None on ambiguous/junk values.

    Handles currency symbols, percent signs, thousands separators, accounting
    negatives written as ``(1,234)`` and continental ``1.234,56`` notation.
    Only the first numeric token in the text is used.

    Returns:
        The parsed ``float``, or ``None`` when *s* is ``None``, empty, looks
        like a sentence, or contains no numeric token.
    """
    if s is None:
        return None

    text = _strip_noise(str(s).strip())
    if not text:
        return None

    # Reject likely sentence-like content.
    if len(text) > 40 and re.search(r"[A-Za-z]{3,}", text):
        return None

    # Accounting convention: a fully parenthesised figure is negative.
    neg = text.startswith("(") and text.endswith(")")
    text = text.strip("()").strip()

    # Common artifacts.
    text = text.replace("$", "").replace("€", "").replace("£", "")
    text = text.replace("%", "").replace("\u00a0", " ").replace(" ", "")
    # A typographic minus sign (U+2212) would otherwise be skipped by the
    # token regex and the sign silently lost.
    text = text.replace("\u2212", "-")

    # Keep only the first numeric token if separated by delimiters.
    token_match = _NUMBER_TOKEN_RE.search(text)
    if not token_match:
        return None

    # Handle 1.234,56 style numbers. The plain token regex stops at the first
    # dot group, so the continental form has to be recognised separately,
    # anchored at the same starting position.
    euro_match = _EURO_NUMBER_RE.match(text, token_match.start())
    if euro_match:
        token = euro_match.group(0).replace(".", "").replace(",", ".")
    else:
        token = token_match.group(0).replace(",", "")

    val = float(token)
    if neg and val > 0:
        val = -val
    return val
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
def norm_period(col_header: str):
    """Derive a period label from a column header.

    Returns ``"TTM"``, one of ``"Q1"``..``"Q4"``, the upper-cased and trimmed
    header itself (when it contains ``"FY"`` or a year from 1990 to 2100), or
    ``""`` when nothing is recognised.
    """
    label = str(col_header or "").upper().strip()
    if not label:
        return ""
    if "TTM" in label or "TRAILING TWELVE MONTHS" in label:
        return "TTM"

    # Quarter markers win over fiscal-year / calendar-year hints.
    quarter = next((q for q in ("Q1", "Q2", "Q3", "Q4") if q in label), None)
    if quarter is not None:
        return quarter

    mentions_year = "FY" in label or any(str(year) in label for year in range(1990, 2101))
    return label if mentions_year else ""
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
def norm_scale(value, units_note: str = ""):
    """Scale *value* by the multiplier implied by a units caption.

    Returns a ``(scaled_value, multiplier)`` tuple. The multiplier is 1e9,
    1e6 or 1e3 for billion / million / thousand captions and 1.0 otherwise.
    When *value* cannot be converted to ``float`` it is returned unchanged
    alongside the multiplier.
    """
    caption = (units_note or "").lower().strip()
    padded = f" {caption} "  # padding lets " in b " / " in m " match at either end

    def _mentions(word, suffix, phrase):
        return word in caption or caption.endswith(suffix) or phrase in padded

    # Billions are checked before millions, millions before thousands.
    if _mentions("billion", "bn", " in b "):
        factor = 1e9
    elif _mentions("million", "mm", " in m "):
        factor = 1e6
    elif "thousand" in caption or "000s" in caption:
        factor = 1e3
    else:
        factor = 1.0

    try:
        scaled = float(value) * factor
    except (TypeError, ValueError):
        return value, factor
    return scaled, factor
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def read_docx(docx_path: str):
    """Read non-empty paragraphs and all tables from a DOCX file.

    Returns a dict with ``source``, ``path``, ``paragraphs`` (stripped text)
    and ``tables`` (each ``{"table_number", "rows"}``, numbered from 1). If
    python-docx cannot be imported, the lists are empty and ``error`` is set.
    """
    try:
        from docx import Document  # type: ignore
    except Exception:
        return {"source": "docx", "path": docx_path, "paragraphs": [], "tables": [], "error": "python_docx_unavailable"}

    document = Document(docx_path)

    paragraphs = []
    for para in document.paragraphs:
        stripped = para.text.strip() if para.text else ""
        if stripped:
            paragraphs.append(stripped)

    tables = [
        {
            "table_number": number,
            "rows": [[cell.text.strip() for cell in row.cells] for row in table.rows],
        }
        for number, table in enumerate(document.tables, start=1)
    ]
    return {"source": "docx", "path": docx_path, "paragraphs": paragraphs, "tables": tables}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def read_html(html: str):
    """Extract every ``<table>`` in an HTML string as rows of cell text.

    Returns ``{"source": "html", "tables": [...]}`` where each table is
    ``{"table_number", "rows"}`` numbered from 1. If bs4 cannot be imported,
    ``tables`` is empty and ``error`` is set.
    """
    try:
        from bs4 import BeautifulSoup  # type: ignore
    except Exception:
        return {"source": "html", "tables": [], "error": "bs4_unavailable"}

    def _row_texts(tr):
        # Header and data cells are treated alike.
        return [cell.get_text(" ", strip=True) for cell in tr.find_all(["th", "td"])]

    document = BeautifulSoup(html, "html.parser")
    tables = [
        {"table_number": number, "rows": [_row_texts(tr) for tr in table.find_all("tr")]}
        for number, table in enumerate(document.find_all("table"), start=1)
    ]
    return {"source": "html", "tables": tables}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def read_ixbrl(ixbrl_path: str):
    """Collect inline-XBRL facts from an iXBRL/HTML file.

    Every element whose tag name contains ``nonFraction`` or ``nonNumeric``
    (compared case-insensitively) becomes one fact dict with the keys
    ``name``, ``contextRef``, ``unitRef`` and ``value`` (the element's
    concatenated text, stripped).

    Returns ``{"source": "ixbrl", "path", "facts"}``. If lxml cannot be
    imported, ``facts`` is empty and ``error`` is set.
    """
    try:
        from lxml import etree  # type: ignore
    except Exception:
        return {"source": "ixbrl", "path": ixbrl_path, "facts": [], "error": "lxml_unavailable"}

    parser = etree.HTMLParser(recover=True)
    tree = etree.parse(ixbrl_path, parser)

    # The HTML parser may normalise tag and attribute names to lower case, so
    # a case-sensitive match on "nonFraction" / "contextRef" can find nothing.
    # XPath 1.0 has no lower-case(); translate() folds the tag name instead.
    upper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    folded_name = f"translate(name(), '{upper}', '{upper.lower()}')"
    query = (
        f"//*[contains({folded_name}, 'nonfraction')"
        f" or contains({folded_name}, 'nonnumeric')]"
    )

    facts = []
    for elem in tree.xpath(query):
        # Case-insensitive attribute lookup, for the same reason as above.
        attrs = {str(key).lower(): val for key, val in elem.attrib.items()}
        facts.append(
            {
                "name": attrs.get("name") or attrs.get("format") or "",
                "contextRef": attrs.get("contextref", ""),
                "unitRef": attrs.get("unitref", ""),
                "value": "".join(elem.itertext()).strip(),
            }
        )

    return {"source": "ixbrl", "path": ixbrl_path, "facts": facts}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def read_pdf_tables(pdf_path: str):
    """Flatten every table in a PDF into a list of value cells.

    The first row of each table supplies column headers and the first column
    of each later row supplies the row header. Each remaining cell becomes a
    dict with ``row_header``, ``col_header``, ``value``, ``page`` and
    ``table_number`` (pages and tables numbered from 1, tables per page).

    Returns ``{"source": "pdf_table", "path", "cells"}``. If pdfplumber cannot
    be imported, ``cells`` is empty and ``error`` is set.
    """
    try:
        import pdfplumber  # type: ignore
    except Exception:
        return {"source": "pdf_table", "path": pdf_path, "cells": [], "error": "pdfplumber_unavailable"}

    def _text(raw):
        # None / empty cells become "".
        return str(raw or "").strip()

    cells = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            for table_no, table in enumerate(page.extract_tables() or [], start=1):
                if not table:
                    continue
                headers = [_text(h) for h in table[0]]
                for row in table[1:]:
                    row_header = _text(row[0]) if row else ""
                    for ci, raw_val in enumerate(row[1:], start=1):
                        # Rows wider than the header row get an empty column header.
                        col_header = headers[ci] if ci < len(headers) else ""
                        cells.append(
                            {
                                "row_header": row_header,
                                "col_header": col_header,
                                "value": _text(raw_val),
                                "page": page_no,
                                "table_number": table_no,
                            }
                        )
    return {"source": "pdf_table", "path": pdf_path, "cells": cells}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def read_pdf_text(pdf_path: str):
    """Extract the text of each page of a PDF.

    Returns ``{"source": "pdf_text", "path", "pages"}`` where ``pages`` is a
    list of ``{"page", "text"}`` numbered from 1; pages without text yield
    ``""``. If pdfplumber cannot be imported, ``pages`` is empty and ``error``
    is set.
    """
    try:
        import pdfplumber  # type: ignore
    except Exception:
        return {"source": "pdf_text", "path": pdf_path, "pages": [], "error": "pdfplumber_unavailable"}

    with pdfplumber.open(pdf_path) as pdf:
        pages = [
            {"page": number, "text": page.extract_text() or ""}
            for number, page in enumerate(pdf.pages, start=1)
        ]
    return {"source": "pdf_text", "path": pdf_path, "pages": pages}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def read_xlsx(xlsx_path: str):
    """Read every worksheet of an XLSX file as rows of stripped strings.

    Cells are read as stored values (``data_only=True``); empty cells become
    ``""``.

    Returns ``{"source": "xlsx", "path", "sheets"}`` where ``sheets`` maps the
    sheet title to its list of rows. If openpyxl cannot be imported,
    ``sheets`` is empty and ``error`` is set.
    """
    try:
        from openpyxl import load_workbook  # type: ignore
    except Exception:
        return {"source": "xlsx", "path": xlsx_path, "sheets": {}, "error": "openpyxl_unavailable"}

    wb = load_workbook(xlsx_path, data_only=True, read_only=True)
    try:
        sheets = {}
        for ws in wb.worksheets:
            sheets[ws.title] = [
                ["" if c is None else str(c).strip() for c in row]
                for row in ws.iter_rows(values_only=True)
            ]
    finally:
        # A read-only workbook keeps the file open until closed explicitly.
        wb.close()
    return {"source": "xlsx", "path": xlsx_path, "sheets": sheets}
|
extract_lib/register.py
ADDED
extract_lib/registry.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Extractor registry for FinBench extract_lib."""
|
|
2
|
+
|
|
3
|
+
# Static catalogue of extractors, grouped by domain.
# Each extractor entry is a 5-tuple: (id, display name, kind, description, inputs).
DOMAINS = {
    "G01_readers": {
        "title": "Document Readers",
        "module": "readers",
        "extractors": [
            ("read_pdf_text", "PDF Text Reader", "reader", "Layout-aware text via pdfplumber", "pdf_path"),
            ("read_pdf_tables", "PDF Table Reader", "reader", "Recover tables via Camelot/pdfplumber", "pdf_path"),
            ("read_ocr", "OCR Reader", "reader", "pytesseract fallback for scanned PDFs", "pdf_path"),
            ("read_ixbrl", "iXBRL Tag Reader", "reader", "Parse us-gaap:* tagged facts (best SEC source)", "ixbrl_path"),
            ("read_docx", "DOCX Reader", "reader", "python-docx paragraphs + tables", "docx_path"),
            ("read_xlsx", "XLSX Reader", "reader", "openpyxl sheets + cells", "xlsx_path"),
            ("read_html", "HTML Table Reader", "reader", "bs4 table extraction", "html"),
        ],
    },
    "G02_structure": {
        "title": "Table Structure",
        "module": "structure",
        "extractors": [
            ("detect_header_row", "Header Row Detector", "structure", "Find which row holds column headers", "rows"),
            ("detect_units_note", "Units Note Detector", "structure", "Find 'in millions/thousands' caption", "page_text"),
            ("cluster_cells", "Cell Clusterer", "structure", "Group fragmented text into cells", "words"),
            ("link_row_col", "Row-Col Linker", "structure", "Attach row+col headers to each value", "grid"),
        ],
    },
    "G03_normalize": {
        "title": "Value Normalization",
        "module": "normalize",
        "extractors": [
            ("norm_number", "Number Parser", "normalize", "'34,229' -> 34229.0; reject non-numeric", "s"),
            ("norm_sign", "Sign Normalizer", "normalize", "'(1,234)' -> -1234 (accounting negatives)", "s"),
            ("norm_scale", "Scale Applier", "normalize", "Apply 'in millions' -> x1e6", "value, units_note"),
            ("norm_currency", "Currency Tagger", "normalize", "Strip $/EUR and tag currency", "s"),
            ("norm_percent", "Percent Parser", "normalize", "'12.5%' -> 0.125 or 12.5 per mode", "s"),
            ("norm_date", "Date Parser", "normalize", "Parse filing date strings", "s"),
            ("norm_period", "Period Resolver", "normalize", "Map column header -> FY/Q period", "col_header"),
        ],
    },
    "G04_resolve": {
        "title": "Metric Resolution",
        "module": "resolver",
        "extractors": [
            ("resolve_metric", "Metric Cell Resolver", "resolve", "Given metric+cells, return the ONE correct cell", "metric, cells, period"),
            ("score_cell", "Cell Scorer", "resolve", "Score how well a cell matches a metric", "metric, cell"),
            ("reject_decoy", "Decoy Rejecter", "resolve", "Drop cells matching anti-patterns", "metric, cell"),
        ],
    },
}
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def all_extractors():
    """Return every extractor tuple from all domains as one flat list, in registry order."""
    return [entry for domain in DOMAINS.values() for entry in domain["extractors"]]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def count_summary():
    """Return a dict mapping each domain key to the number of extractors it lists."""
    summary = {}
    for domain_key, domain in DOMAINS.items():
        summary[domain_key] = len(domain["extractors"])
    return summary
|
extract_lib/resolver.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Dict, List
|
|
4
|
+
|
|
5
|
+
from .base import ExtractResult, extractor
|
|
6
|
+
from .normalize.number import norm_number
|
|
7
|
+
from .synonyms import METRIC_SYNONYMS
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@extractor("reject_decoy", "Decoy Rejecter", "resolve", "Reject cells that match anti-patterns.")
def reject_decoy(metric: str, cell: Dict) -> bool:
    """Return ``True`` when *cell* must not be used for *metric*.

    A cell is rejected when its row header is empty, or when the lower-cased
    row header contains any of the metric's ``"anti"`` phrases from
    ``METRIC_SYNONYMS``. Unknown metrics have no anti phrases.
    """
    row_label = str(cell.get("row_header", "")).lower().strip()
    if not row_label:
        return True
    anti_phrases = METRIC_SYNONYMS.get(metric, {}).get("anti", [])
    return any(phrase in row_label for phrase in anti_phrases)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@extractor("score_cell", "Cell Scorer", "resolve", "Score how well a cell matches a metric.")
def score_cell(metric: str, cell: Dict, period: str = "") -> float:
    """Score how well *cell* matches *metric*; ``0.0`` means "do not use".

    The row header is compared against the metric's ``"positive"`` phrases:
    an exact match scores 1.0, a phrase inside a header fewer than 15
    characters longer than the phrase scores 0.6, any other containment 0.3.
    The best phrase score is kept. The cell scores 0.0 if it is a decoy, its
    value does not parse, or its magnitude is below the metric's
    ``"value_min"``. A column header containing *period* adds 0.3.
    """
    row_label = str(cell.get("row_header", "")).lower().strip()
    if not row_label or reject_decoy(metric, cell):
        return 0.0

    synonyms = METRIC_SYNONYMS.get(metric, {})

    def _label_score(phrase):
        if phrase == row_label:
            return 1.0
        if phrase not in row_label:
            return 0.0
        # Short surrounding text is a closer match than a long descriptive row.
        return 0.6 if len(row_label) < len(phrase) + 15 else 0.3

    score = max((_label_score(p) for p in synonyms.get("positive", [])), default=0.0)
    if score == 0.0:
        return 0.0

    amount = norm_number(str(cell.get("value", "")))
    if amount is None or abs(amount) < synonyms.get("value_min", 0.0):
        return 0.0

    if period and period.lower() in str(cell.get("col_header", "")).lower():
        score += 0.3

    return score
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@extractor("resolve_metric", "Metric Cell Resolver", "resolve", "Return best cell for a metric.")
def resolve_metric(metric: str, cells: List[Dict], period: str = "") -> ExtractResult:
    """Pick the best-scoring cell for *metric* and wrap it in an ``ExtractResult``.

    Cells are scored with ``score_cell``; cells scoring 0 are dropped. The
    result is invalid with ``error="no cell matched"`` when nothing scores,
    and with ``error="low confidence"`` when the best score is below 0.5.
    Otherwise the best cell's parsed value and headers are returned,
    ``confidence`` is the score capped at 1.0, and ``candidates`` lists up to
    five top-scoring cells.
    """
    ranked = [(score_cell(metric, cell, period), cell) for cell in cells]
    ranked = [pair for pair in ranked if pair[0] > 0]
    if not ranked:
        return ExtractResult(metric_id=metric, value=None, valid=False, error="no cell matched", confidence=0.0)

    # Stable sort: cells with equal scores keep their input order.
    ranked = sorted(ranked, key=lambda pair: pair[0], reverse=True)
    top_score, top_cell = ranked[0]
    if top_score < 0.5:
        return ExtractResult(metric_id=metric, value=None, valid=False, error="low confidence", confidence=top_score)

    raw_text = str(top_cell.get("value", ""))
    parsed = norm_number(raw_text)

    shortlist = []
    for cand_score, cand in ranked[:5]:
        shortlist.append({"score": cand_score, "row": cand.get("row_header", ""), "val": cand.get("value", "")})

    return ExtractResult(
        metric_id=metric,
        value=parsed,
        raw_value=raw_text,
        row_header=top_cell.get("row_header", ""),
        col_header=top_cell.get("col_header", ""),
        page=top_cell.get("page"),
        table_number=top_cell.get("table_number"),
        source="pdf_table",
        confidence=min(top_score, 1.0),
        valid=parsed is not None,
        candidates=shortlist,
    )
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
def cluster_cells(words):
    """Group word dicts into cells by coarse position buckets.

    Each dict in *words* contributes its ``"text"`` to the bucket
    ``(top // 8, x0 // 30)``; non-dict items are skipped and missing
    coordinates count as 0. Returns one ``{"row_bucket", "col_bucket",
    "text"}`` dict per bucket, sorted by (row, column), with the non-empty
    texts joined by single spaces in input order.
    """
    if not words:
        return []

    # Stable deterministic grouping by rounded bbox buckets when available.
    buckets = {}
    for word in words:
        if not isinstance(word, dict):
            continue
        row_bucket = int(float(word.get("top", 0)) // 8)
        col_bucket = int(float(word.get("x0", 0)) // 30)
        token = str(word.get("text", "")).strip()
        buckets.setdefault((row_bucket, col_bucket), []).append(token)

    return [
        {"row_bucket": row, "col_bucket": col, "text": " ".join(filter(None, buckets[(row, col)]))}
        for row, col in sorted(buckets)
    ]
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
def detect_header_row(rows):
    """Return the index of the most header-like row among the first five.

    A row's score is its count of non-blank string cells minus its count of
    numeric-looking cells; the first row with the highest score wins. Returns
    ``None`` for empty input, and 0 when no row scores above -1.
    """
    if not rows:
        return None

    def _looks_numeric(cell):
        text = str(cell or "").strip().replace(",", "")
        # Allow one decimal point and one minus sign around plain digits.
        return bool(text) and text.replace(".", "", 1).replace("-", "", 1).isdigit()

    best_index, best_score = 0, -1
    for index, row in enumerate(rows[:5]):
        labelled = len([c for c in row if isinstance(c, str) and c.strip()])
        numeric = len([c for c in row if _looks_numeric(c)])
        score = labelled - numeric
        if score > best_score:
            best_index, best_score = index, score
    return best_index
|
extract_lib/synonyms.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Per-metric matching rules for row headers:
#   positive  - phrases that identify the metric
#   anti      - phrases that disqualify a row
#   value_min - smallest absolute value accepted for the metric
METRIC_SYNONYMS = {
    "revenue": dict(
        positive=["total revenue", "net sales", "total net sales", "net revenue", "total revenues", "revenue, net"],
        anti=[
            "revenue-generating", "revenue generating", "deferred revenue", "revenue recognition",
            "unearned revenue", "by segment", "disaggregation", "activities",
        ],
        value_min=1.0,
    ),
    "cogs": dict(
        positive=["cost of sales", "cost of goods sold", "cost of products sold", "cost of revenue"],
        anti=["percentage", "% of", "ratio"],
        value_min=1.0,
    ),
    "net_income": dict(
        positive=["net income", "net earnings", "net income attributable to", "profit for the year"],
        anti=["per share", "margin", "%", "comprehensive", "noncontrolling"],
        value_min=0.0,
    ),
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from extract_lib import * # noqa: F401,F403
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fina_extractor_lib
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Deterministic financial document extraction library
|
|
5
|
+
Author: Potheesh Vignesh K
|
|
6
|
+
License: Proprietary
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: pdfplumber>=0.11
|
|
10
|
+
Requires-Dist: lxml>=5.0
|
|
11
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
12
|
+
Requires-Dist: openpyxl>=3.1
|
|
13
|
+
Requires-Dist: python-docx>=1.1
|
|
14
|
+
Provides-Extra: dev
|
|
15
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
16
|
+
|
|
17
|
+
# fina-extract-lib

Deterministic financial document extraction library.
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
extract_lib/__init__.py,sha256=N1Fnpqdk9uxoL_v3iKgHIN5Cjnbu0y4TKVgFujF3GUE,475
|
|
2
|
+
extract_lib/base.py,sha256=vdwCi0rYWmQYD02v-HvSUO85oQl49BCadC_z6RsNkQQ,1146
|
|
3
|
+
extract_lib/exceptions.py,sha256=GThQ6tE-3BEvOMtXkAQJ5bhLUMgiovJDmrT9s76fPXg,68
|
|
4
|
+
extract_lib/register.py,sha256=1l2XskRM5SvnS69JpgHGD1yaS7qpBZkxPUgDiN0zSYs,119
|
|
5
|
+
extract_lib/registry.py,sha256=an5HJqyByOqXXjEfu1DP5dLgtcdKhQzUvxaLCdpuW44,2945
|
|
6
|
+
extract_lib/resolver.py,sha256=h_Mz0ORL-ulRrOPRomFN4kugve4QcoWhzE77tR2fKAI,2756
|
|
7
|
+
extract_lib/synonyms.py,sha256=V4HIpJQr58xo8nflXrqXuHs8HF3mS8FqyfQQN4BFvPs,1111
|
|
8
|
+
extract_lib/normalize/__init__.py,sha256=F3cueDq1Uw1QecFY5QbvaFH5wTgH5YncEvtI270HXTI,90
|
|
9
|
+
extract_lib/normalize/currency.py,sha256=4T_oaVHdpj4byDqVO0SzUuMaK5zkKhB0WaGgX6NVuFA,253
|
|
10
|
+
extract_lib/normalize/dates.py,sha256=LZRTjMvtTgGZGh10cdBIDW-bWSAYb5hFhZXWYh6jwMo,355
|
|
11
|
+
extract_lib/normalize/number.py,sha256=aPybgBiOemmStJHDBrqiAeutN5T7Wgnp8gcVBX69bxc,1483
|
|
12
|
+
extract_lib/normalize/period.py,sha256=Sq5Tc0oObXTfyjN0EBwKblHoMdDCr7iBokp5o6E6h5A,409
|
|
13
|
+
extract_lib/normalize/scale.py,sha256=Jt9vOxhXktGZzowxhW_WDz1u3pbcDMWiNw5Xd9h5I6g,477
|
|
14
|
+
extract_lib/normalize/sign.py,sha256=fj7Z7MCqC4DQpmBsgv4c1lrhHMqqqs_SlSEXy6NCCSM,175
|
|
15
|
+
extract_lib/readers/__init__.py,sha256=Ks5T4uRpj3BubxJfGQLPhXMssOpvaS-B7mPHHerB1xg,106
|
|
16
|
+
extract_lib/readers/docx_reader.py,sha256=BL8mgkpKDK0m79RjVtZ46M3vtWOpDObPbO7Bj_RN9V4,724
|
|
17
|
+
extract_lib/readers/html_reader.py,sha256=6idomIp2hXsbLpAdUprE4YJB-GwedR51oy6GZvllyzo,632
|
|
18
|
+
extract_lib/readers/ixbrl.py,sha256=UUd9sTdgpxR47iD-3zPJ_5TQByeF2nxM1B8pTHxGgAY,838
|
|
19
|
+
extract_lib/readers/ocr.py,sha256=bWT9S7s1eqpy6oglJc7NsWjOnGG86zUQWigRFQgQhtU,182
|
|
20
|
+
extract_lib/readers/pdf_tables.py,sha256=gHnq6GU4n0_hKlJGrRB89knPX-9ou5emR7OreA8q4dA,1305
|
|
21
|
+
extract_lib/readers/pdf_text.py,sha256=987h2tpCd9tUWZJHF5VyYCyw_9YuT1IYR-zHMuSvuco,506
|
|
22
|
+
extract_lib/readers/xlsx_reader.py,sha256=0zgYR7E6FfRzhNZeSXfviVEXm9f6xtbsEOuK35lj43M,616
|
|
23
|
+
extract_lib/structure/__init__.py,sha256=mcKBHzLwpXQRghTZICsdGT6zA2JGfot0FCTrlO9q0fI,115
|
|
24
|
+
extract_lib/structure/cell_cluster.py,sha256=aZk9uPNPk-1GF3g0rJYxwOBbyg_EqVu7Q84GC7FJhQI,601
|
|
25
|
+
extract_lib/structure/header_detect.py,sha256=HCN8d1p-wKRT1LIgaGLdh9RsVUG38uQdYeJgSagAr6M,574
|
|
26
|
+
extract_lib/structure/table_model.py,sha256=B4EmAU4Cfsu64ICNeTr2obsAGr_toiwM_YSs40doW00,212
|
|
27
|
+
fina_extractor_lib/__init__.py,sha256=xPlsxDtwtFnVC1A1yY_9_YHbEHBqkX0d4WwbMvXVfpA,45
|
|
28
|
+
fina_extractor_lib-0.1.0.dist-info/METADATA,sha256=W84PX9hxwoSdbo5CQGKAcgMK2q47eykD3g25b24pt7U,543
|
|
29
|
+
fina_extractor_lib-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
30
|
+
fina_extractor_lib-0.1.0.dist-info/top_level.txt,sha256=F-3yzWVOJGtNzBPabe7WpG1UgCUBrpeGb1FPYLDnS58,31
|
|
31
|
+
fina_extractor_lib-0.1.0.dist-info/RECORD,,
|