fina-extractor-lib 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. fina_extractor_lib-0.1.1/PKG-INFO +4 -0
  2. fina_extractor_lib-0.1.1/README.md +1 -0
  3. fina_extractor_lib-0.1.1/pyproject.toml +12 -0
  4. fina_extractor_lib-0.1.1/setup.cfg +4 -0
  5. fina_extractor_lib-0.1.1/src/fina_extractor_lib/__init__.py +18 -0
  6. fina_extractor_lib-0.1.1/src/fina_extractor_lib/base.py +44 -0
  7. fina_extractor_lib-0.1.1/src/fina_extractor_lib/exceptions.py +2 -0
  8. fina_extractor_lib-0.1.1/src/fina_extractor_lib/normalize/__init__.py +3 -0
  9. fina_extractor_lib-0.1.1/src/fina_extractor_lib/normalize/currency.py +7 -0
  10. fina_extractor_lib-0.1.1/src/fina_extractor_lib/normalize/dates.py +17 -0
  11. fina_extractor_lib-0.1.1/src/fina_extractor_lib/normalize/number.py +56 -0
  12. fina_extractor_lib-0.1.1/src/fina_extractor_lib/normalize/period.py +14 -0
  13. fina_extractor_lib-0.1.1/src/fina_extractor_lib/normalize/scale.py +13 -0
  14. fina_extractor_lib-0.1.1/src/fina_extractor_lib/normalize/sign.py +7 -0
  15. fina_extractor_lib-0.1.1/src/fina_extractor_lib/readers/__init__.py +3 -0
  16. fina_extractor_lib-0.1.1/src/fina_extractor_lib/readers/docx_reader.py +18 -0
  17. fina_extractor_lib-0.1.1/src/fina_extractor_lib/readers/html_reader.py +18 -0
  18. fina_extractor_lib-0.1.1/src/fina_extractor_lib/readers/ixbrl.py +25 -0
  19. fina_extractor_lib-0.1.1/src/fina_extractor_lib/readers/ocr.py +3 -0
  20. fina_extractor_lib-0.1.1/src/fina_extractor_lib/readers/pdf_tables.py +30 -0
  21. fina_extractor_lib-0.1.1/src/fina_extractor_lib/readers/pdf_text.py +14 -0
  22. fina_extractor_lib-0.1.1/src/fina_extractor_lib/readers/xlsx_reader.py +17 -0
  23. fina_extractor_lib-0.1.1/src/fina_extractor_lib/register.py +3 -0
  24. fina_extractor_lib-0.1.1/src/fina_extractor_lib/registry.py +63 -0
  25. fina_extractor_lib-0.1.1/src/fina_extractor_lib/resolver.py +85 -0
  26. fina_extractor_lib-0.1.1/src/fina_extractor_lib/structure/__init__.py +3 -0
  27. fina_extractor_lib-0.1.1/src/fina_extractor_lib/structure/cell_cluster.py +17 -0
  28. fina_extractor_lib-0.1.1/src/fina_extractor_lib/structure/header_detect.py +17 -0
  29. fina_extractor_lib-0.1.1/src/fina_extractor_lib/structure/table_model.py +11 -0
  30. fina_extractor_lib-0.1.1/src/fina_extractor_lib/synonyms.py +43 -0
  31. fina_extractor_lib-0.1.1/src/fina_extractor_lib.egg-info/PKG-INFO +4 -0
  32. fina_extractor_lib-0.1.1/src/fina_extractor_lib.egg-info/SOURCES.txt +32 -0
  33. fina_extractor_lib-0.1.1/src/fina_extractor_lib.egg-info/dependency_links.txt +1 -0
  34. fina_extractor_lib-0.1.1/src/fina_extractor_lib.egg-info/top_level.txt +1 -0
@@ -0,0 +1,4 @@
1
+ Metadata-Version: 2.4
2
+ Name: fina_extractor_lib
3
+ Version: 0.1.1
4
+ License-Expression: MIT
@@ -0,0 +1 @@
1
+ # fina-extractor-lib: deterministic financial document extraction library.
@@ -0,0 +1,12 @@
1
+ [project]
2
+ name = "fina_extractor_lib"
3
+ version = "0.1.1"
4
+ # NOTE: no other project fields (description, authors, dependencies) are declared yet.
5
+
6
+ # License as a single SPDX expression string (used instead of a [project.license] table).
7
+ license = "MIT"
8
+
9
+ # Package discovery for the src/ layout.
10
+ [tool.setuptools.packages.find]
11
+ where = ["src"]
12
+ include = ["fina_extractor_lib*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,18 @@
1
+ from .base import ExtractResult, RUNTIME_EXTRACTOR_REGISTRY, build_extract, extractor
2
+ from .registry import DOMAINS, all_extractors, count_summary
3
+ from . import resolver
4
+ from .normalize import number, scale, sign
5
+ from .readers import ixbrl, pdf_tables
6
+
7
# Public alias for the runtime registry imported from .base; both names refer
# to the same dict object, so registrations are visible through either.
EXTRACTOR_REGISTRY = RUNTIME_EXTRACTOR_REGISTRY

# Names exported by ``from fina_extractor_lib import *``.
__all__ = [
    "DOMAINS",
    "all_extractors",
    "count_summary",
    "ExtractResult",
    "EXTRACTOR_REGISTRY",
    "extractor",
    "build_extract",
    "resolver",
]
@@ -0,0 +1,44 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Any, Dict, List, Optional
5
+
6
+
7
@dataclass
class ExtractResult:
    """Result record for one extracted metric, with provenance and scoring fields."""

    value: Any  # extracted value; the resolver stores a parsed float, or None when nothing matched
    metric_id: str  # identifier of the metric this result belongs to
    raw_value: str = ""  # unparsed cell text the value came from
    unit: str = ""  # NOTE(review): not populated by the resolver in this package - confirm intended content
    scale: float = 1.0  # NOTE(review): presumably the multiplier applied to value - confirm
    period: str = ""  # NOTE(review): presumably a reporting-period label - confirm
    page: Optional[int] = None  # page of the source cell, when the cell carries one
    table_number: Optional[int] = None  # table index of the source cell, when the cell carries one
    row_header: str = ""  # row label of the source cell
    col_header: str = ""  # column label of the source cell
    source: str = ""  # source label (the resolver writes "pdf_table")
    confidence: float = 0.0  # match score; the resolver caps it at 1.0
    valid: bool = True  # the resolver sets False when no usable value was produced
    error: Optional[str] = None  # reason text the resolver sets for unmatched / low-confidence results
    candidates: List[Dict[str, Any]] = field(default_factory=list)  # fresh list per instance
24
+
25
+
26
# Maps extractor id -> metadata dict with keys "fn", "name", "kind",
# "description" and "inputs". Filled as a side effect of the @extractor decorator.
RUNTIME_EXTRACTOR_REGISTRY: Dict[str, Dict[str, Any]] = {}


def extractor(eid: str, name: str, kind: str, description: str = ""):
    """Return a decorator that registers a callable under ``eid``.

    The decorated callable is returned unchanged; registration only records
    metadata in ``RUNTIME_EXTRACTOR_REGISTRY``. Registering the same ``eid``
    twice overwrites the earlier entry.

    Args:
        eid: Registry key for the extractor.
        name: Human-readable name.
        kind: Category label (e.g. "resolve").
        description: Optional free-text description.

    The recorded ``"inputs"`` list holds the positional parameter names of the
    callable. It is derived with ``inspect.signature`` so that callables
    without a ``__code__`` attribute (``functools.partial`` objects, callable
    instances) and ``functools.wraps``-decorated wrappers are handled; when no
    signature can be determined the list is empty instead of raising.
    """
    import inspect  # local import: this module's header does not import it

    positional_kinds = (
        inspect.Parameter.POSITIONAL_ONLY,
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
    )

    def _input_names(fn):
        # Some builtins expose no introspectable signature.
        try:
            params = inspect.signature(fn).parameters.values()
        except (TypeError, ValueError):
            return []
        return [p.name for p in params if p.kind in positional_kinds]

    def _wrap(fn):
        RUNTIME_EXTRACTOR_REGISTRY[eid] = {
            "fn": fn,
            "name": name,
            "kind": kind,
            "description": description,
            "inputs": _input_names(fn),
        }
        return fn

    return _wrap
41
+
42
+
43
def build_extract(metric_id: str, value: Any, **kw: Any) -> ExtractResult:
    """Build an ExtractResult for ``metric_id``; extra keywords are passed through as fields."""
    fields = dict(kw, metric_id=metric_id, value=value)
    return ExtractResult(**fields)
@@ -0,0 +1,2 @@
1
class ExtractError(Exception):
    """Base extraction exception.

    Adds no behaviour of its own beyond ``Exception``.
    """
@@ -0,0 +1,3 @@
1
+ from .number import norm_number
2
+ from .sign import norm_sign
3
+ from .scale import norm_scale
@@ -0,0 +1,7 @@
1
def norm_currency(s: str):
    """Strip a currency marker from ``s`` and report which currency it was.

    Args:
        s: Raw value text; ``None`` or other falsy input is treated as "".

    Returns:
        A ``(text, code)`` tuple. ``code`` is "USD" when a "$" symbol or a
        standalone "usd" code is present, "EUR" for "€" / "eur", else "".
        ``text`` is the stripped input with the symbol and the code removed.

    USD is checked before EUR. The ISO code is matched case-insensitively and
    only when it is not embedded in a longer word, so "entrepreneur" is not
    tagged as EUR; a code glued to digits ("100usd") still counts.
    """
    import re  # local import: this module has no top-level imports

    text = str(s or "").strip()
    for symbol, code in (("$", "USD"), ("€", "EUR")):
        code_pattern = re.compile(rf"(?<![a-z]){code}(?![a-z])", re.IGNORECASE)
        if symbol in text or code_pattern.search(text):
            cleaned = code_pattern.sub("", text.replace(symbol, ""))
            return cleaned.strip(), code
    return text, ""
@@ -0,0 +1,17 @@
1
+ from datetime import datetime
2
+
3
+
4
def norm_date(s: str):
    """Parse ``s`` against a fixed list of date formats.

    Formats are tried in order and the first that parses wins, so an ambiguous
    slash date is read month-first before day-first.

    Returns:
        The date as an ISO ``YYYY-MM-DD`` string, or ``None`` if no format fits.
    """
    text = str(s).strip()
    known_formats = (
        "%Y-%m-%d",
        "%d-%m-%Y",
        "%m/%d/%Y",
        "%d/%m/%Y",
        "%b %d, %Y",
        "%B %d, %Y",
    )
    for pattern in known_formats:
        try:
            parsed = datetime.strptime(text, pattern)
        except ValueError:
            continue
        return parsed.date().isoformat()
    return None
@@ -0,0 +1,56 @@
1
+ import re
2
+
3
+
4
# Footnote/annotation fragments removed (case-insensitively) before parsing.
_NOISE_PATTERNS = [
    r"\bnote\s*\d+\b",
    r"\bsee\s+note\b",
    r"\bunaudited\b",
]

# First numeric token of a cleaned string. The European alternative
# (dot-grouped thousands with a comma decimal, e.g. "1.234,56") is listed
# first; the generic alternative alone would stop after "1.234" and the
# comma-decimal handling below could never trigger.
_NUMBER_TOKEN = re.compile(r"[-+]?(?:\d{1,3}(?:\.\d{3})+,\d+|\d[\d,]*(?:\.\d+)?)")


def _strip_noise(text: str) -> str:
    """Remove every _NOISE_PATTERNS match from ``text`` and strip whitespace."""
    out = text
    for pat in _NOISE_PATTERNS:
        out = re.sub(pat, "", out, flags=re.IGNORECASE)
    return out.strip()


def norm_number(s):
    """Parse a financial number safely, return None on ambiguous/junk values.

    Args:
        s: Raw cell content; converted with ``str()``. ``None`` yields ``None``.

    Returns:
        The parsed ``float``, or ``None`` when no number can be extracted.

    Handling, in order: noise fragments are stripped; text longer than 40
    characters that contains a run of 3+ letters is rejected as sentence-like;
    a value fully wrapped in parentheses is negative; currency symbols, "%"
    and spaces are dropped; the first numeric token is taken. Commas are
    thousands separators unless the token has the "1.234,56" shape, in which
    case dots are grouping and the comma is the decimal mark.
    """
    if s is None:
        return None

    text = _strip_noise(str(s).strip())
    if not text:
        return None

    # Reject likely sentence-like content.
    if len(text) > 40 and re.search(r"[A-Za-z]{3,}", text):
        return None

    # Accounting-style negatives: "(1,234)".
    neg = text.startswith("(") and text.endswith(")")
    text = text.strip("()").strip()

    # Common artifacts.
    for artifact in ("$", "€", "£", "%"):
        text = text.replace(artifact, "")
    text = text.replace("\u00a0", " ").replace(" ", "")

    # Keep only the first numeric token if separated by delimiters.
    token_match = _NUMBER_TOKEN.search(text)
    if not token_match:
        return None
    token = token_match.group(0)

    # Handle 1.234,56 style: single comma positioned after the last dot.
    if token.count(",") == 1 and token.count(".") >= 1 and token.rfind(",") > token.rfind("."):
        token = token.replace(".", "").replace(",", ".")
    else:
        token = token.replace(",", "")

    if not re.fullmatch(r"[-+]?\d+(?:\.\d+)?", token):
        return None

    val = float(token)
    if neg and val > 0:
        val = -val
    return val
@@ -0,0 +1,14 @@
1
def norm_period(col_header: str):
    """Derive a period label from a column header.

    The header is upper-cased and stripped first. Returns "TTM" for
    trailing-twelve-month headers, the first of "Q1".."Q4" found in the text,
    the whole upper-cased header when it contains "FY" or any year from 1990
    to 2100, and "" otherwise (including empty input).
    """
    label = str(col_header or "").upper().strip()
    if not label:
        return ""
    if "TTM" in label or "TRAILING TWELVE MONTHS" in label:
        return "TTM"

    quarter = next((q for q in ("Q1", "Q2", "Q3", "Q4") if q in label), None)
    if quarter is not None:
        return quarter

    mentions_year = any(str(year) in label for year in range(1990, 2101))
    if "FY" in label or mentions_year:
        return label
    return ""
@@ -0,0 +1,13 @@
1
def norm_scale(value, units_note: str = ""):
    """Scale ``value`` by the multiplier implied by a units caption.

    The caption is lower-cased and stripped. Billions are checked first, then
    millions, then thousands; with no match the multiplier is 1.0.

    Returns:
        ``(float(value) * multiplier, multiplier)``; if ``value`` cannot be
        converted to float it is returned unchanged alongside the multiplier.
    """
    note = (units_note or "").lower().strip()
    padded = f" {note} "

    # (multiplier, substrings, suffix, space-delimited phrase)
    rules = (
        (1e9, ("billion",), "bn", " in b "),
        (1e6, ("million",), "mm", " in m "),
        (1e3, ("thousand", "000s"), None, None),
    )
    factor = 1.0
    for candidate, words, suffix, phrase in rules:
        if (
            any(word in note for word in words)
            or (suffix is not None and note.endswith(suffix))
            or (phrase is not None and phrase in padded)
        ):
            factor = candidate
            break

    try:
        scaled = float(value) * factor
    except (TypeError, ValueError):
        return value, factor
    return scaled, factor
@@ -0,0 +1,7 @@
1
def norm_sign(s):
    """Rewrite an accounting negative "(x)" as the string "-x".

    ``None`` is returned as-is. Any other input is converted to a stripped
    string; only text both starting with "(" and ending with ")" is changed.
    The result is a string, not a number.
    """
    if s is None:
        return None
    text = str(s).strip()
    wrapped = text.startswith("(") and text.endswith(")")
    return "-" + text[1:-1].strip() if wrapped else text
@@ -0,0 +1,3 @@
1
+ from .pdf_text import read_pdf_text
2
+ from .pdf_tables import read_pdf_tables
3
+ from .ixbrl import read_ixbrl
@@ -0,0 +1,18 @@
1
+ from __future__ import annotations
2
+
3
+
4
def read_docx(docx_path: str):
    """Read the non-empty paragraphs and all tables of a DOCX file.

    Returns a dict with "source", "path", "paragraphs" (stripped text) and
    "tables" (1-based "table_number" plus "rows" of stripped cell text). If
    the ``docx`` import fails, an empty result with "error" set is returned.
    """
    try:
        from docx import Document  # type: ignore
    except Exception:
        return {"source": "docx", "path": docx_path, "paragraphs": [], "tables": [], "error": "python_docx_unavailable"}

    document = Document(docx_path)

    paragraphs = []
    for para in document.paragraphs:
        if para.text and para.text.strip():
            paragraphs.append(para.text.strip())

    tables = [
        {
            "table_number": number,
            "rows": [[cell.text.strip() for cell in row.cells] for row in table.rows],
        }
        for number, table in enumerate(document.tables, start=1)
    ]
    return {"source": "docx", "path": docx_path, "paragraphs": paragraphs, "tables": tables}
@@ -0,0 +1,18 @@
1
+ from __future__ import annotations
2
+
3
+
4
def read_html(html: str):
    """Extract every <table> in an HTML string as rows of cell text.

    Returns a dict with "source" and "tables"; each table has a 1-based
    "table_number" and "rows", where a row lists the text of its th/td cells.
    If the ``bs4`` import fails, an empty result with "error" set is returned.
    """
    try:
        from bs4 import BeautifulSoup  # type: ignore
    except Exception:
        return {"source": "html", "tables": [], "error": "bs4_unavailable"}

    soup = BeautifulSoup(html, "html.parser")

    def _rows_of(table):
        return [
            [cell.get_text(" ", strip=True) for cell in tr.find_all(["th", "td"])]
            for tr in table.find_all("tr")
        ]

    tables = [
        {"table_number": number, "rows": _rows_of(table)}
        for number, table in enumerate(soup.find_all("table"), start=1)
    ]
    return {"source": "html", "tables": tables}
@@ -0,0 +1,25 @@
1
+ from __future__ import annotations
2
+
3
+
4
def read_ixbrl(ixbrl_path: str):
    """Collect inline-XBRL facts (nonFraction / nonNumeric elements) from a file.

    Args:
        ixbrl_path: Path handed to ``lxml.etree.parse``.

    Returns:
        A dict with "source", "path" and "facts". Each fact has "name" (the
        element's ``name`` attribute, else ``format``, else ""),
        "contextRef", "unitRef" and "value" (the element's stripped text).
        If the ``lxml`` import fails, an empty result with "error" set is
        returned.

    The document is parsed with the recovering HTML parser. Element and
    attribute names are matched case-insensitively because that parser folds
    names to lower case, which made the original case-sensitive lookups for
    ``nonFraction`` / ``contextRef`` / ``unitRef`` miss.
    """
    try:
        from lxml import etree  # type: ignore
    except Exception:
        return {"source": "ixbrl", "path": ixbrl_path, "facts": [], "error": "lxml_unavailable"}

    parser = etree.HTMLParser(recover=True)
    tree = etree.parse(ixbrl_path, parser)

    # XPath 1.0 has no lower-case(); translate() does the case folding.
    upper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    folded_name = f"translate(name(), '{upper}', '{upper.lower()}')"
    query = f"//*[contains({folded_name}, 'nonfraction') or contains({folded_name}, 'nonnumeric')]"

    facts = []
    for elem in tree.xpath(query):
        # Lower-cased view of the attributes so lookups work whatever the stored case.
        attrs = {str(key).lower(): val for key, val in elem.attrib.items()}
        facts.append(
            {
                "name": attrs.get("name") or attrs.get("format") or "",
                "contextRef": attrs.get("contextref", ""),
                "unitRef": attrs.get("unitref", ""),
                "value": "".join(elem.itertext()).strip(),
            }
        )

    return {"source": "ixbrl", "path": ixbrl_path, "facts": facts}
@@ -0,0 +1,3 @@
1
def read_ocr(pdf_path: str):
    """Placeholder OCR reader: performs no OCR and returns a fixed "not_configured" result."""
    # Deterministic placeholder: explicit non-guessing fallback.
    result = {"source": "ocr", "path": pdf_path}
    result["pages"] = []
    result["status"] = "not_configured"
    return result
@@ -0,0 +1,30 @@
1
+ from __future__ import annotations
2
+
3
+
4
def read_pdf_tables(pdf_path: str):
    """Flatten every table pdfplumber finds in a PDF into a list of cells.

    The first row of each table supplies the column headers and the first
    column of each later row supplies the row header; every remaining value
    becomes one cell dict with "row_header", "col_header", "value", "page"
    and "table_number" (page and table numbers are 1-based). If the
    ``pdfplumber`` import fails, an empty result with "error" set is returned.
    """
    try:
        import pdfplumber  # type: ignore
    except Exception:
        return {"source": "pdf_table", "path": pdf_path, "cells": [], "error": "pdfplumber_unavailable"}

    cells = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            for table_no, table in enumerate(page.extract_tables() or [], start=1):
                if not table:
                    continue
                headers = [str(head or "").strip() for head in table[0]]
                for row in table[1:]:
                    label = str((row[0] if row else "") or "").strip()
                    cells.extend(
                        {
                            "row_header": label,
                            # Rows may be wider than the header row.
                            "col_header": headers[col] if col < len(headers) else "",
                            "value": str(raw or "").strip(),
                            "page": page_no,
                            "table_number": table_no,
                        }
                        for col, raw in enumerate(row[1:], start=1)
                    )
    return {"source": "pdf_table", "path": pdf_path, "cells": cells}
@@ -0,0 +1,14 @@
1
+ from __future__ import annotations
2
+
3
+
4
def read_pdf_text(pdf_path: str):
    """Extract the text of every PDF page via pdfplumber.

    Returns a dict with "source", "path" and "pages", a list of
    ``{"page": n, "text": ...}`` with 1-based page numbers and "" for pages
    without text. If the ``pdfplumber`` import fails, an empty result with
    "error" set is returned.
    """
    try:
        import pdfplumber  # type: ignore
    except Exception:
        return {"source": "pdf_text", "path": pdf_path, "pages": [], "error": "pdfplumber_unavailable"}

    with pdfplumber.open(pdf_path) as pdf:
        pages = [
            {"page": number, "text": page.extract_text() or ""}
            for number, page in enumerate(pdf.pages, start=1)
        ]
    return {"source": "pdf_text", "path": pdf_path, "pages": pages}
@@ -0,0 +1,17 @@
1
+ from __future__ import annotations
2
+
3
+
4
def read_xlsx(xlsx_path: str):
    """Read every worksheet of an XLSX file as rows of stripped strings.

    Args:
        xlsx_path: Path handed to ``openpyxl.load_workbook``.

    Returns:
        A dict with "source", "path" and "sheets", mapping each sheet title to
        its rows; empty cells become "". The workbook is opened with
        ``data_only=True``. If the ``openpyxl`` import fails, an empty result
        with "error" set is returned.
    """
    try:
        from openpyxl import load_workbook  # type: ignore
    except Exception:
        return {"source": "xlsx", "path": xlsx_path, "sheets": {}, "error": "openpyxl_unavailable"}

    wb = load_workbook(xlsx_path, data_only=True, read_only=True)
    try:
        sheets = {}
        for ws in wb.worksheets:
            sheets[ws.title] = [
                ["" if c is None else str(c).strip() for c in row]
                for row in ws.iter_rows(values_only=True)
            ]
    finally:
        # A read-only workbook keeps the file open until closed explicitly.
        wb.close()
    return {"source": "xlsx", "path": xlsx_path, "sheets": sheets}
@@ -0,0 +1,3 @@
1
+ from .registry import DOMAINS, all_extractors, count_summary
2
+
3
# Re-export of the registry names imported from .registry.
__all__ = ["DOMAINS", "all_extractors", "count_summary"]
@@ -0,0 +1,63 @@
1
+ """Extractor registry for FinBench extract_lib."""
2
+
3
# Static catalogue of extractor domains. Each entry in "extractors" is a tuple
# of (id, display name, kind, description, comma-separated input names).
DOMAINS = {
    "G01_readers": {
        "title": "Document Readers",
        "module": "readers",
        "extractors": [
            ("read_pdf_text", "PDF Text Reader", "reader", "Layout-aware text via pdfplumber", "pdf_path"),
            ("read_pdf_tables", "PDF Table Reader", "reader", "Recover tables via Camelot/pdfplumber", "pdf_path"),
            ("read_ocr", "OCR Reader", "reader", "pytesseract fallback for scanned PDFs", "pdf_path"),
            ("read_ixbrl", "iXBRL Tag Reader", "reader", "Parse us-gaap:* tagged facts (best SEC source)", "ixbrl_path"),
            ("read_docx", "DOCX Reader", "reader", "python-docx paragraphs + tables", "docx_path"),
            ("read_xlsx", "XLSX Reader", "reader", "openpyxl sheets + cells", "xlsx_path"),
            ("read_html", "HTML Table Reader", "reader", "bs4 table extraction", "html"),
        ],
    },
    "G02_structure": {
        "title": "Table Structure",
        "module": "structure",
        "extractors": [
            ("detect_header_row", "Header Row Detector", "structure", "Find which row holds column headers", "rows"),
            ("detect_units_note", "Units Note Detector", "structure", "Find 'in millions/thousands' caption", "page_text"),
            ("cluster_cells", "Cell Clusterer", "structure", "Group fragmented text into cells", "words"),
            ("link_row_col", "Row-Col Linker", "structure", "Attach row+col headers to each value", "grid"),
        ],
    },
    "G03_normalize": {
        "title": "Value Normalization",
        "module": "normalize",
        "extractors": [
            ("norm_number", "Number Parser", "normalize", "'34,229' -> 34229.0; reject non-numeric", "s"),
            ("norm_sign", "Sign Normalizer", "normalize", "'(1,234)' -> -1234 (accounting negatives)", "s"),
            ("norm_scale", "Scale Applier", "normalize", "Apply 'in millions' -> x1e6", "value, units_note"),
            ("norm_currency", "Currency Tagger", "normalize", "Strip $/EUR and tag currency", "s"),
            ("norm_percent", "Percent Parser", "normalize", "'12.5%' -> 0.125 or 12.5 per mode", "s"),
            ("norm_date", "Date Parser", "normalize", "Parse filing date strings", "s"),
            ("norm_period", "Period Resolver", "normalize", "Map column header -> FY/Q period", "col_header"),
        ],
    },
    "G04_resolve": {
        "title": "Metric Resolution",
        "module": "resolver",
        "extractors": [
            ("resolve_metric", "Metric Cell Resolver", "resolve", "Given metric+cells, return the ONE correct cell", "metric, cells, period"),
            ("score_cell", "Cell Scorer", "resolve", "Score how well a cell matches a metric", "metric, cell"),
            ("reject_decoy", "Decoy Rejecter", "resolve", "Drop cells matching anti-patterns", "metric, cell"),
        ],
    },
}
53
+
54
+
55
def all_extractors():
    """Return one flat list of every extractor tuple, in DOMAINS order."""
    return [entry for domain in DOMAINS.values() for entry in domain["extractors"]]
60
+
61
+
62
def count_summary():
    """Return a dict mapping each domain key to its number of extractors."""
    summary = {}
    for key, domain in DOMAINS.items():
        summary[key] = len(domain["extractors"])
    return summary
@@ -0,0 +1,85 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Dict, List
4
+
5
+ from .base import ExtractResult, extractor
6
+ from .normalize.number import norm_number
7
+ from .synonyms import METRIC_SYNONYMS
8
+
9
+
10
@extractor("reject_decoy", "Decoy Rejecter", "resolve", "Reject cells that match anti-patterns.")
def reject_decoy(metric: str, cell: Dict) -> bool:
    """Return True when ``cell`` should be discarded for ``metric``.

    A cell is rejected when its row header is empty, or when the lower-cased
    row header contains any of the metric's "anti" phrases from
    METRIC_SYNONYMS. Unknown metrics have no anti phrases.
    """
    row_label = str(cell.get("row_header", "")).lower().strip()
    if not row_label:
        return True
    anti_phrases = METRIC_SYNONYMS.get(metric, {}).get("anti", [])
    return any(phrase in row_label for phrase in anti_phrases)
20
+
21
+
22
@extractor("score_cell", "Cell Scorer", "resolve", "Score how well a cell matches a metric.")
def score_cell(metric: str, cell: Dict, period: str = "") -> float:
    """Score how well ``cell`` matches ``metric``; 0.0 means "not a candidate".

    The base score is the best match of the lower-cased row header against
    the metric's "positive" phrases: 1.0 for an exact match, 0.6 when the
    phrase is contained and the header is fewer than 15 characters longer
    than it, 0.3 for any other containment. Cells rejected by
    ``reject_decoy``, with an unparseable value, or with an absolute value
    below the metric's "value_min" score 0.0. A ``period`` found in the
    column header (case-insensitive) adds 0.3.
    """
    rules = METRIC_SYNONYMS.get(metric, {})
    row_label = str(cell.get("row_header", "")).lower().strip()
    if not row_label or reject_decoy(metric, cell):
        return 0.0

    def _strength(phrase):
        if phrase == row_label:
            return 1.0
        if phrase not in row_label:
            return 0.0
        return 0.6 if len(row_label) < len(phrase) + 15 else 0.3

    score = max((_strength(phrase) for phrase in rules.get("positive", [])), default=0.0)
    if score == 0.0:
        return 0.0

    number = norm_number(str(cell.get("value", "")))
    if number is None or abs(number) < rules.get("value_min", 0.0):
        return 0.0

    column_label = str(cell.get("col_header", "")).lower()
    if period and period.lower() in column_label:
        score += 0.3
    return score
54
+
55
+
56
@extractor("resolve_metric", "Metric Cell Resolver", "resolve", "Return best cell for a metric.")
def resolve_metric(metric: str, cells: List[Dict], period: str = "") -> ExtractResult:
    """Pick the highest-scoring cell for ``metric`` and wrap it in an ExtractResult.

    Cells are scored with ``score_cell``; only positive scores are kept and
    ties keep their input order. The result is invalid with error
    "no cell matched" when nothing scores, or "low confidence" when the best
    score is below 0.5. Otherwise the best cell's parsed value and provenance
    are returned, with confidence capped at 1.0 and up to five top candidates
    listed.
    """
    scored_all = ((score_cell(metric, candidate, period), candidate) for candidate in cells)
    ranked = [pair for pair in scored_all if pair[0] > 0]

    if not ranked:
        return ExtractResult(metric_id=metric, value=None, valid=False, error="no cell matched", confidence=0.0)

    # list.sort is stable, so equally scored cells stay in input order.
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    top_score, winner = ranked[0]
    if top_score < 0.5:
        return ExtractResult(metric_id=metric, value=None, valid=False, error="low confidence", confidence=top_score)

    raw_text = str(winner.get("value", ""))
    parsed = norm_number(raw_text)
    shortlist = [
        {"score": points, "row": cell.get("row_header", ""), "val": cell.get("value", "")}
        for points, cell in ranked[:5]
    ]

    return ExtractResult(
        metric_id=metric,
        value=parsed,
        raw_value=raw_text,
        row_header=winner.get("row_header", ""),
        col_header=winner.get("col_header", ""),
        page=winner.get("page"),
        table_number=winner.get("table_number"),
        source="pdf_table",
        confidence=min(top_score, 1.0),
        valid=parsed is not None,
        candidates=shortlist,
    )
@@ -0,0 +1,3 @@
1
+ from .table_model import Cell
2
+ from .header_detect import detect_header_row
3
+ from .cell_cluster import cluster_cells
@@ -0,0 +1,17 @@
1
def cluster_cells(words):
    """Group word dicts into coarse grid cells by position.

    Each dict in ``words`` is bucketed by ``top // 8`` (row) and ``x0 // 30``
    (column), with missing coordinates treated as 0; non-dict items are
    skipped. Returns a list of ``{"row_bucket", "col_bucket", "text"}`` dicts
    sorted by bucket, where "text" joins the non-empty stripped word texts
    with spaces in input order. Empty input yields [].
    """
    if not words:
        return []

    # Stable deterministic grouping by rounded bbox buckets when available.
    buckets = {}
    for word in words:
        if isinstance(word, dict):
            row = int(float(word.get("top", 0)) // 8)
            col = int(float(word.get("x0", 0)) // 30)
            buckets.setdefault((row, col), []).append(str(word.get("text", "")).strip())

    return [
        {"row_bucket": row, "col_bucket": col, "text": " ".join(filter(None, parts))}
        for (row, col), parts in sorted(buckets.items())
    ]
@@ -0,0 +1,17 @@
1
def detect_header_row(rows):
    """Guess which of the first five rows is the header row.

    Each row scores (non-empty string cells) minus (numeric-looking cells),
    where numeric-looking means digits after dropping commas, one "." and one
    "-". The earliest row with the highest score above -1 wins; index 0 is
    returned when no row scores above -1. Returns None for empty input.
    """
    if not rows:
        return None

    def _looks_numeric(cell):
        text = str(cell or "").strip().replace(",", "")
        return bool(text) and text.replace(".", "", 1).replace("-", "", 1).isdigit()

    winner, top_margin = 0, -1
    for index, row in enumerate(rows[:5]):
        labelled = len([cell for cell in row if isinstance(cell, str) and cell.strip()])
        numeric = sum(1 for cell in row if _looks_numeric(cell))
        margin = labelled - numeric
        if margin > top_margin:
            winner, top_margin = index, margin
    return winner
@@ -0,0 +1,11 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+
5
@dataclass
class Cell:
    """One table cell: its value text with the row/column labels and optional location."""

    row_header: str  # label of the row the value sits in
    col_header: str  # label of the column the value sits in
    value: str  # cell content as text
    page: Optional[int] = None  # page number, when known
    table_number: Optional[int] = None  # table index, when known
@@ -0,0 +1,43 @@
1
# Per-metric matching rules read by the resolver module:
#   "positive"  - phrases compared against the lower-cased row header
#                 (exact match or substring),
#   "anti"      - substrings that cause a row header to be rejected,
#   "value_min" - smallest absolute value accepted for the metric.
# All phrases are lower case because the row header is lower-cased before comparison.
METRIC_SYNONYMS = {
    "revenue": {
        "positive": [
            "total revenue",
            "net sales",
            "total net sales",
            "net revenue",
            "total revenues",
            "revenue, net",
        ],
        "anti": [
            "revenue-generating",
            "revenue generating",
            "deferred revenue",
            "revenue recognition",
            "unearned revenue",
            "by segment",
            "disaggregation",
            "activities",
        ],
        "value_min": 1.0,
    },
    "cogs": {
        "positive": [
            "cost of sales",
            "cost of goods sold",
            "cost of products sold",
            "cost of revenue",
        ],
        "anti": ["percentage", "% of", "ratio"],
        "value_min": 1.0,
    },
    "net_income": {
        "positive": [
            "net income",
            "net earnings",
            "net income attributable to",
            "profit for the year",
        ],
        "anti": ["per share", "margin", "%", "comprehensive", "noncontrolling"],
        "value_min": 0.0,
    },
}
@@ -0,0 +1,4 @@
1
+ Metadata-Version: 2.4
2
+ Name: fina_extractor_lib
3
+ Version: 0.1.1
4
+ License-Expression: MIT
@@ -0,0 +1,32 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/fina_extractor_lib/__init__.py
4
+ src/fina_extractor_lib/base.py
5
+ src/fina_extractor_lib/exceptions.py
6
+ src/fina_extractor_lib/register.py
7
+ src/fina_extractor_lib/registry.py
8
+ src/fina_extractor_lib/resolver.py
9
+ src/fina_extractor_lib/synonyms.py
10
+ src/fina_extractor_lib.egg-info/PKG-INFO
11
+ src/fina_extractor_lib.egg-info/SOURCES.txt
12
+ src/fina_extractor_lib.egg-info/dependency_links.txt
13
+ src/fina_extractor_lib.egg-info/top_level.txt
14
+ src/fina_extractor_lib/normalize/__init__.py
15
+ src/fina_extractor_lib/normalize/currency.py
16
+ src/fina_extractor_lib/normalize/dates.py
17
+ src/fina_extractor_lib/normalize/number.py
18
+ src/fina_extractor_lib/normalize/period.py
19
+ src/fina_extractor_lib/normalize/scale.py
20
+ src/fina_extractor_lib/normalize/sign.py
21
+ src/fina_extractor_lib/readers/__init__.py
22
+ src/fina_extractor_lib/readers/docx_reader.py
23
+ src/fina_extractor_lib/readers/html_reader.py
24
+ src/fina_extractor_lib/readers/ixbrl.py
25
+ src/fina_extractor_lib/readers/ocr.py
26
+ src/fina_extractor_lib/readers/pdf_tables.py
27
+ src/fina_extractor_lib/readers/pdf_text.py
28
+ src/fina_extractor_lib/readers/xlsx_reader.py
29
+ src/fina_extractor_lib/structure/__init__.py
30
+ src/fina_extractor_lib/structure/cell_cluster.py
31
+ src/fina_extractor_lib/structure/header_detect.py
32
+ src/fina_extractor_lib/structure/table_model.py
@@ -0,0 +1 @@
1
+ fina_extractor_lib