PyPI - fina-extractor-lib - Versions diffs - 0.1.0__py3-none-any.whl - Mend

fina-extractor-lib 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

extract_lib/__init__.py +18 -0
extract_lib/base.py +44 -0
extract_lib/exceptions.py +2 -0
extract_lib/normalize/__init__.py +3 -0
extract_lib/normalize/currency.py +7 -0
extract_lib/normalize/dates.py +17 -0
extract_lib/normalize/number.py +56 -0
extract_lib/normalize/period.py +14 -0
extract_lib/normalize/scale.py +13 -0
extract_lib/normalize/sign.py +7 -0
extract_lib/readers/__init__.py +3 -0
extract_lib/readers/docx_reader.py +18 -0
extract_lib/readers/html_reader.py +18 -0
extract_lib/readers/ixbrl.py +25 -0
extract_lib/readers/ocr.py +3 -0
extract_lib/readers/pdf_tables.py +30 -0
extract_lib/readers/pdf_text.py +14 -0
extract_lib/readers/xlsx_reader.py +17 -0
extract_lib/register.py +3 -0
extract_lib/registry.py +63 -0
extract_lib/resolver.py +85 -0
extract_lib/structure/__init__.py +3 -0
extract_lib/structure/cell_cluster.py +17 -0
extract_lib/structure/header_detect.py +17 -0
extract_lib/structure/table_model.py +11 -0
extract_lib/synonyms.py +43 -0
fina_extractor_lib/__init__.py +1 -0
fina_extractor_lib-0.1.0.dist-info/METADATA +17 -0
fina_extractor_lib-0.1.0.dist-info/RECORD +31 -0
fina_extractor_lib-0.1.0.dist-info/WHEEL +5 -0
fina_extractor_lib-0.1.0.dist-info/top_level.txt +2 -0

extract_lib/__init__.py ADDED Viewed

@@ -0,0 +1,18 @@
+from .base import ExtractResult, RUNTIME_EXTRACTOR_REGISTRY, build_extract, extractor
+from .registry import DOMAINS, all_extractors, count_summary
+from . import resolver
+from .normalize import number, scale, sign
+from .readers import ixbrl, pdf_tables
# Public alias for the runtime registry object imported from `.base`; both
# names refer to the same dict, so entries added through either are shared.
EXTRACTOR_REGISTRY = RUNTIME_EXTRACTOR_REGISTRY
# Names exported by `from extract_lib import *`.
# NOTE(review): the submodules imported above (number, scale, sign, ixbrl,
# pdf_tables) are not listed here — confirm whether that is intentional.
__all__ = [
    "DOMAINS",
    "all_extractors",
    "count_summary",
    "ExtractResult",
    "EXTRACTOR_REGISTRY",
    "extractor",
    "build_extract",
    "resolver",
]

extract_lib/base.py ADDED Viewed

@@ -0,0 +1,44 @@
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
@dataclass
class ExtractResult:
    """Outcome of extracting one metric, together with its provenance.

    Only ``value`` and ``metric_id`` are required. Every other field has a
    neutral default so callers can fill in just what they know.
    """

    # Extracted value and the metric it belongs to.
    value: Any
    metric_id: str

    # Text and unit context for the value.
    raw_value: str = ""
    unit: str = ""
    scale: float = 1.0
    period: str = ""

    # Location of the value in the source document.
    page: Optional[int] = None
    table_number: Optional[int] = None
    row_header: str = ""
    col_header: str = ""
    source: str = ""

    # Quality signals and diagnostics.
    confidence: float = 0.0
    valid: bool = True
    error: Optional[str] = None
    # A fresh list per instance (never shared between results).
    candidates: List[Dict[str, Any]] = field(default_factory=list)
# Maps extractor id -> metadata record written by the `extractor` decorator.
RUNTIME_EXTRACTOR_REGISTRY: Dict[str, Dict[str, Any]] = {}


def _positional_param_names(fn) -> List[str]:
    """Return the names of *fn*'s positional parameters, or [] if unknown.

    ``inspect.signature`` is used instead of ``fn.__code__`` so that callables
    without a code object (builtins, ``functools.partial`` objects, instances
    defining ``__call__``) can be registered without raising AttributeError.
    For plain functions the result equals
    ``co_varnames[:co_argcount]``.
    """
    import inspect

    try:
        params = inspect.signature(fn).parameters.values()
    except (TypeError, ValueError):
        # Not introspectable (e.g. some C callables): record no inputs rather
        # than failing the registration.
        return []
    positional = (
        inspect.Parameter.POSITIONAL_ONLY,
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
    )
    return [p.name for p in params if p.kind in positional]


def extractor(eid: str, name: str, kind: str, description: str = ""):
    """Decorator factory that records a callable in RUNTIME_EXTRACTOR_REGISTRY.

    Args:
        eid: Registry key; a later registration with the same id overwrites
            the earlier one.
        name: Display name stored in the record.
        kind: Category label stored in the record.
        description: Optional free-text description stored in the record.

    Returns:
        A decorator that stores ``fn`` plus its metadata (including the list
        of positional parameter names under ``"inputs"``) and returns ``fn``
        unchanged.
    """

    def _wrap(fn):
        RUNTIME_EXTRACTOR_REGISTRY[eid] = {
            "fn": fn,
            "name": name,
            "kind": kind,
            "description": description,
            "inputs": _positional_param_names(fn),
        }
        return fn

    return _wrap
def build_extract(metric_id: str, value: Any, **kw: Any) -> ExtractResult:
    """Create an ExtractResult for *metric_id* holding *value*.

    Any extra keyword arguments are passed straight through as ExtractResult
    fields.
    """
    fields = dict(kw, metric_id=metric_id, value=value)
    return ExtractResult(**fields)

extract_lib/exceptions.py ADDED Viewed

	@@ -0,0 +1,2 @@
class ExtractError(Exception):
    """Base extraction exception."""

extract_lib/normalize/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .number import norm_number
+from .sign import norm_sign
+from .scale import norm_scale

extract_lib/normalize/currency.py ADDED Viewed

@@ -0,0 +1,7 @@
def norm_currency(s: str):
    """Strip a currency marker from *s* and report which currency it was.

    Both the symbol (``$``, ``€``) and the three-letter code (``USD``,
    ``EUR``, any case) are recognised. Previously the code was detected but
    left in the returned text, so ``"USD 100"`` came back unchanged.

    Args:
        s: Raw cell text; falsy values are treated as an empty string.

    Returns:
        ``(text_without_marker, code)`` where ``code`` is ``"USD"``,
        ``"EUR"`` or ``""`` when no marker was found. USD is checked first.
    """
    import re

    text = str(s or "").strip()
    lowered = text.lower()
    for symbol, code in (("$", "USD"), ("€", "EUR")):
        if symbol in text or code.lower() in lowered:
            cleaned = text.replace(symbol, "")
            # Remove the code only when it stands alone (not inside a longer
            # word such as "euro"), so unrelated text is left intact.
            cleaned = re.sub(
                rf"(?<![A-Za-z]){code}(?![A-Za-z])",
                "",
                cleaned,
                flags=re.IGNORECASE,
            )
            return cleaned.strip(), code
    return text, ""

extract_lib/normalize/dates.py ADDED Viewed

@@ -0,0 +1,17 @@
+from datetime import datetime
def norm_date(s: str):
    """Parse *s* as a date and return it in ISO ``YYYY-MM-DD`` form.

    The formats are tried in a fixed order and the first one that parses
    wins, so an ambiguous ``01/02/2024`` is read month-first.

    Returns:
        The ISO date string, or ``None`` when no format matches.
    """
    known_formats = (
        "%Y-%m-%d",
        "%d-%m-%Y",
        "%m/%d/%Y",
        "%d/%m/%Y",
        "%b %d, %Y",
        "%B %d, %Y",
    )
    for pattern in known_formats:
        try:
            parsed = datetime.strptime(str(s).strip(), pattern)
        except ValueError:
            continue
        else:
            return parsed.date().isoformat()
    return None

extract_lib/normalize/number.py ADDED Viewed

@@ -0,0 +1,56 @@
+import re
# Footnote-style annotations that may trail a number in a table cell.
_NOISE_PATTERNS = [
    r"\bnote\s*\d+\b",
    r"\bsee\s+note\b",
    r"\bunaudited\b",
]
# Compiled once; applied one after another in the order listed above.
_NOISE_REGEXES = [re.compile(pat, re.IGNORECASE) for pat in _NOISE_PATTERNS]

# First numeric token in a cell. The first alternative captures the
# continental "1.234,56" form (dot-grouped thousands, comma decimal) as a
# whole; the second is the usual "1,234.56" form.
_NUMBER_TOKEN_RE = re.compile(
    r"[-+]?(?:\d{1,3}(?:\.\d{3})+,\d+(?![\d.,])|\d[\d,]*(?:\.\d+)?)"
)
_PLAIN_NUMBER_RE = re.compile(r"[-+]?\d+(?:\.\d+)?")
_SENTENCE_WORD_RE = re.compile(r"[A-Za-z]{3,}")

# Characters removed before tokenising (currency signs, percent, spaces);
# the Unicode minus sign is mapped to ASCII "-" so the sign is not lost.
_ARTIFACT_TABLE = str.maketrans(
    {
        "$": None,
        "€": None,
        "£": None,
        "%": None,
        "\u00a0": None,
        " ": None,
        "\u2212": "-",
    }
)


def _strip_noise(text: str) -> str:
    """Remove each noise pattern from *text* and trim surrounding whitespace."""
    out = text
    for regex in _NOISE_REGEXES:
        out = regex.sub("", out)
    return out.strip()


def norm_number(s):
    """Parse a financial number safely, return None on ambiguous/junk values.

    Handles thousands separators, accounting negatives written in
    parentheses, currency/percent signs, the Unicode minus sign and the
    continental ``1.234,56`` notation.

    Args:
        s: Raw cell content; converted with ``str()``.

    Returns:
        The parsed value as ``float``, or ``None`` when *s* is ``None``,
        empty, sentence-like, or contains no numeric token.
    """
    if s is None:
        return None
    text = _strip_noise(str(s).strip())
    if not text:
        return None
    # Reject likely sentence-like content.
    if len(text) > 40 and _SENTENCE_WORD_RE.search(text):
        return None
    # "(1,234)" is the accounting notation for a negative amount.
    neg = text.startswith("(") and text.endswith(")")
    text = text.strip("()").strip().translate(_ARTIFACT_TABLE)
    # Keep only the first numeric token if separated by delimiters.
    token_match = _NUMBER_TOKEN_RE.search(text)
    if token_match is None:
        return None
    token = token_match.group(0)
    # "1.234,56": dots group thousands and the single comma is the decimal mark.
    if token.count(",") == 1 and "." in token and token.rfind(",") > token.rfind("."):
        token = token.replace(".", "").replace(",", ".")
    else:
        token = token.replace(",", "")
    if not _PLAIN_NUMBER_RE.fullmatch(token):
        return None
    val = float(token)
    if neg and val > 0:
        val = -val
    return val

extract_lib/normalize/period.py ADDED Viewed

@@ -0,0 +1,14 @@
def norm_period(col_header: str):
    """Derive a reporting-period label from a column header.

    Checks, in order: trailing-twelve-months markers (returns ``"TTM"``), a
    quarter tag ``Q1``..``Q4`` (returns just the tag), an ``FY`` marker or a
    year between 1990 and 2100 (both return the whole upper-cased header).

    Returns:
        The label, or ``""`` when the header is empty or nothing matches.
    """
    header = str(col_header or "").upper().strip()
    if not header:
        return ""

    if any(marker in header for marker in ("TTM", "TRAILING TWELVE MONTHS")):
        return "TTM"

    quarter = next((q for q in ("Q1", "Q2", "Q3", "Q4") if q in header), None)
    if quarter is not None:
        return quarter

    if "FY" in header:
        return header

    for year in range(1990, 2101):
        if str(year) in header:
            return header
    return ""

extract_lib/normalize/scale.py ADDED Viewed

@@ -0,0 +1,13 @@
def norm_scale(value, units_note: str = ""):
    """Multiply *value* by the scale implied by a units caption.

    Billions are checked first, then millions, then thousands; with no
    recognised caption the multiplier is 1.0.

    Returns:
        ``(scaled_value, multiplier)``. When *value* cannot be converted to
        ``float`` it is returned unchanged alongside the multiplier.
    """
    note = (units_note or "").lower().strip()
    padded = f" {note} "
    # (multiplier, substrings, suffix, padded phrase) — evaluated top to bottom.
    rules = (
        (1e9, ("billion",), "bn", " in b "),
        (1e6, ("million",), "mm", " in m "),
        (1e3, ("thousand", "000s"), None, None),
    )
    mult = 1.0
    for factor, substrings, suffix, phrase in rules:
        matched = any(word in note for word in substrings)
        if not matched and suffix is not None:
            matched = note.endswith(suffix) or phrase in padded
        if matched:
            mult = factor
            break
    try:
        scaled = float(value) * mult
    except (TypeError, ValueError):
        return value, mult
    return scaled, mult

extract_lib/normalize/sign.py ADDED Viewed

@@ -0,0 +1,7 @@
def norm_sign(s):
    """Rewrite an accounting negative ``"(1,234)"`` as ``"-1,234"``.

    Only the sign notation changes; the digits are returned as text and are
    not parsed. An inner value that already carries a sign is not given a
    second one (``"(-5)"`` -> ``"-5"``, matching how ``norm_number`` reads
    it), and empty parentheses yield ``""`` rather than a bare ``"-"``.

    Args:
        s: Raw cell content, or ``None``.

    Returns:
        ``None`` for ``None``; otherwise the stripped text with the
        parenthesised form converted to a leading minus.
    """
    if s is None:
        return None
    text = str(s).strip()
    if not (text.startswith("(") and text.endswith(")")):
        return text
    inner = text[1:-1].strip()
    if not inner:
        return ""
    if inner.startswith("-"):
        # Already negative: adding another "-" would produce "--5".
        return inner
    if inner.startswith("+"):
        inner = inner[1:].lstrip()
    return "-" + inner

extract_lib/readers/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .pdf_text import read_pdf_text
+from .pdf_tables import read_pdf_tables
+from .ixbrl import read_ixbrl

extract_lib/readers/docx_reader.py ADDED Viewed

@@ -0,0 +1,18 @@
+from __future__ import annotations
def read_docx(docx_path: str):
    """Read paragraphs and tables from a .docx file.

    Returns a dict with ``source``, ``path``, ``paragraphs`` (non-blank,
    stripped paragraph texts) and ``tables`` (1-based ``table_number`` plus
    ``rows`` of stripped cell texts). If the ``docx`` package cannot be
    imported, empty lists are returned together with an ``error`` key.
    """
    try:
        from docx import Document  # type: ignore
    except Exception:
        return {"source": "docx", "path": docx_path, "paragraphs": [], "tables": [], "error": "python_docx_unavailable"}

    document = Document(docx_path)

    paragraphs = []
    for para in document.paragraphs:
        # Skip paragraphs that are empty or whitespace-only.
        if para.text and para.text.strip():
            paragraphs.append(para.text.strip())

    tables = [
        {
            "table_number": number,
            "rows": [[cell.text.strip() for cell in row.cells] for row in table.rows],
        }
        for number, table in enumerate(document.tables, start=1)
    ]
    return {"source": "docx", "path": docx_path, "paragraphs": paragraphs, "tables": tables}

extract_lib/readers/html_reader.py ADDED Viewed

@@ -0,0 +1,18 @@
+from __future__ import annotations
def read_html(html: str):
    """Extract every ``<table>`` in an HTML string as rows of cell text.

    Returns a dict with ``source`` and ``tables``; each table has a 1-based
    ``table_number`` and ``rows`` holding the text of its ``th``/``td``
    cells. If ``bs4`` cannot be imported, an empty table list is returned
    together with an ``error`` key.
    """
    try:
        from bs4 import BeautifulSoup  # type: ignore
    except Exception:
        return {"source": "html", "tables": [], "error": "bs4_unavailable"}

    document = BeautifulSoup(html, "html.parser")

    def _row_texts(tr):
        return [cell.get_text(" ", strip=True) for cell in tr.find_all(["th", "td"])]

    tables = [
        {"table_number": number, "rows": [_row_texts(tr) for tr in table.find_all("tr")]}
        for number, table in enumerate(document.find_all("table"), start=1)
    ]
    return {"source": "html", "tables": tables}

extract_lib/readers/ixbrl.py ADDED Viewed

@@ -0,0 +1,25 @@
+from __future__ import annotations
def read_ixbrl(ixbrl_path: str):
    """Collect inline-XBRL facts from the file at *ixbrl_path*.

    Elements whose tag name contains ``nonFraction`` or ``nonNumeric`` are
    returned as dicts with ``name``, ``contextRef``, ``unitRef`` and
    ``value`` (the element's concatenated text, stripped).

    Tag and attribute names are matched case-insensitively: the document is
    read with lxml's HTML parser, which is reported to lower-case names
    (``ix:nonFraction`` -> ``ix:nonfraction``, ``contextRef`` ->
    ``contextref``), in which case the original case-sensitive lookups would
    find nothing. Case-insensitive matching is correct either way.

    Returns:
        ``{"source", "path", "facts"}``; if ``lxml`` cannot be imported, an
        empty fact list is returned together with an ``error`` key.
    """
    try:
        from lxml import etree  # type: ignore
    except Exception:
        return {"source": "ixbrl", "path": ixbrl_path, "facts": [], "error": "lxml_unavailable"}

    parser = etree.HTMLParser(recover=True)
    tree = etree.parse(ixbrl_path, parser)

    # XPath 1.0 has no lower-case(); translate() is the portable equivalent.
    upper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    folded_name = f"translate(name(), '{upper}', '{upper.lower()}')"
    query = (
        f"//*[contains({folded_name}, 'nonfraction')"
        f" or contains({folded_name}, 'nonnumeric')]"
    )

    def _attr(elem, key):
        """Return attribute *key*, falling back to its lower-cased spelling."""
        value = elem.get(key)
        if value is None:
            value = elem.get(key.lower())
        return value

    facts = []
    for elem in tree.xpath(query):
        facts.append(
            {
                "name": _attr(elem, "name") or _attr(elem, "format") or "",
                "contextRef": _attr(elem, "contextRef") or "",
                "unitRef": _attr(elem, "unitRef") or "",
                "value": "".join(elem.itertext()).strip(),
            }
        )
    return {"source": "ixbrl", "path": ixbrl_path, "facts": facts}

extract_lib/readers/ocr.py ADDED Viewed

@@ -0,0 +1,3 @@
def read_ocr(pdf_path: str):
    """Placeholder OCR reader: performs no OCR and reports that explicitly.

    Always returns an empty page list with status ``"not_configured"`` so
    callers never receive guessed text.
    """
    result = {"source": "ocr", "path": pdf_path}
    result["pages"] = []
    result["status"] = "not_configured"
    return result

extract_lib/readers/pdf_tables.py ADDED Viewed

@@ -0,0 +1,30 @@
+from __future__ import annotations
def read_pdf_tables(pdf_path: str):
    """Flatten every table in a PDF into a list of labelled cells.

    For each table the first row supplies the column headers and the first
    column of each later row supplies the row header. Every remaining cell
    becomes a dict with ``row_header``, ``col_header``, ``value``, and the
    1-based ``page`` and ``table_number`` (numbered per page).

    Returns:
        ``{"source", "path", "cells"}``; if ``pdfplumber`` cannot be
        imported, an empty cell list is returned together with an ``error``
        key.
    """
    try:
        import pdfplumber  # type: ignore
    except Exception:
        return {"source": "pdf_table", "path": pdf_path, "cells": [], "error": "pdfplumber_unavailable"}

    cells = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            for table_no, table in enumerate(page.extract_tables() or [], start=1):
                if not table:
                    continue
                headers = [str(head or "").strip() for head in table[0]]
                for row in table[1:]:
                    first = row[0] if row else ""
                    row_header = str(first or "").strip()
                    cells.extend(
                        {
                            "row_header": row_header,
                            # Rows may be wider than the header row.
                            "col_header": headers[ci] if ci < len(headers) else "",
                            "value": str(raw_val or "").strip(),
                            "page": page_no,
                            "table_number": table_no,
                        }
                        for ci, raw_val in enumerate(row[1:], start=1)
                    )
    return {"source": "pdf_table", "path": pdf_path, "cells": cells}

extract_lib/readers/pdf_text.py ADDED Viewed

@@ -0,0 +1,14 @@
+from __future__ import annotations
def read_pdf_text(pdf_path: str):
    """Extract the text of every page of a PDF.

    Returns:
        ``{"source", "path", "pages"}`` where each page entry holds its
        1-based ``page`` number and ``text`` (``""`` when the page yields no
        text). If ``pdfplumber`` cannot be imported, an empty page list is
        returned together with an ``error`` key.
    """
    try:
        import pdfplumber  # type: ignore
    except Exception:
        return {"source": "pdf_text", "path": pdf_path, "pages": [], "error": "pdfplumber_unavailable"}

    with pdfplumber.open(pdf_path) as pdf:
        pages = [
            {"page": number, "text": page.extract_text() or ""}
            for number, page in enumerate(pdf.pages, start=1)
        ]
    return {"source": "pdf_text", "path": pdf_path, "pages": pages}

extract_lib/readers/xlsx_reader.py ADDED Viewed

@@ -0,0 +1,17 @@
+from __future__ import annotations
def read_xlsx(xlsx_path: str):
    """Read every worksheet of an .xlsx file as rows of stripped strings.

    The workbook is opened with ``data_only=True`` and ``read_only=True``.
    Empty cells become ``""``; all other values are converted with ``str()``
    and stripped.

    Returns:
        ``{"source", "path", "sheets"}`` where ``sheets`` maps each sheet
        title to its list of rows. If ``openpyxl`` cannot be imported, an
        empty mapping is returned together with an ``error`` key.
    """
    try:
        from openpyxl import load_workbook  # type: ignore
    except Exception:
        return {"source": "xlsx", "path": xlsx_path, "sheets": {}, "error": "openpyxl_unavailable"}

    wb = load_workbook(xlsx_path, data_only=True, read_only=True)
    try:
        sheets = {}
        for ws in wb.worksheets:
            sheets[ws.title] = [
                ["" if c is None else str(c).strip() for c in row]
                for row in ws.iter_rows(values_only=True)
            ]
    finally:
        # A read-only workbook keeps the file open until closed explicitly.
        wb.close()
    return {"source": "xlsx", "path": xlsx_path, "sheets": sheets}

extract_lib/register.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .registry import DOMAINS, all_extractors, count_summary
# Re-export the registry helpers imported above so they are also importable
# from this module.
__all__ = ["DOMAINS", "all_extractors", "count_summary"]

extract_lib/registry.py ADDED Viewed

@@ -0,0 +1,63 @@
+"""Extractor registry for FinBench extract_lib."""
# Static catalogue of extractors, grouped by domain key.
# Each extractor entry is a 5-tuple:
#   (function name, display name, kind, description, comma-separated inputs)
DOMAINS = {
    "G01_readers": {
        "title": "Document Readers",
        "module": "readers",
        "extractors": [
            ("read_pdf_text", "PDF Text Reader", "reader", "Layout-aware text via pdfplumber", "pdf_path"),
            ("read_pdf_tables", "PDF Table Reader", "reader", "Recover tables via Camelot/pdfplumber", "pdf_path"),
            ("read_ocr", "OCR Reader", "reader", "pytesseract fallback for scanned PDFs", "pdf_path"),
            ("read_ixbrl", "iXBRL Tag Reader", "reader", "Parse us-gaap:* tagged facts (best SEC source)", "ixbrl_path"),
            ("read_docx", "DOCX Reader", "reader", "python-docx paragraphs + tables", "docx_path"),
            ("read_xlsx", "XLSX Reader", "reader", "openpyxl sheets + cells", "xlsx_path"),
            ("read_html", "HTML Table Reader", "reader", "bs4 table extraction", "html"),
        ],
    },
    "G02_structure": {
        "title": "Table Structure",
        "module": "structure",
        "extractors": [
            ("detect_header_row", "Header Row Detector", "structure", "Find which row holds column headers", "rows"),
            ("detect_units_note", "Units Note Detector", "structure", "Find 'in millions/thousands' caption", "page_text"),
            ("cluster_cells", "Cell Clusterer", "structure", "Group fragmented text into cells", "words"),
            ("link_row_col", "Row-Col Linker", "structure", "Attach row+col headers to each value", "grid"),
        ],
    },
    "G03_normalize": {
        "title": "Value Normalization",
        "module": "normalize",
        "extractors": [
            ("norm_number", "Number Parser", "normalize", "'34,229' -> 34229.0; reject non-numeric", "s"),
            ("norm_sign", "Sign Normalizer", "normalize", "'(1,234)' -> -1234 (accounting negatives)", "s"),
            ("norm_scale", "Scale Applier", "normalize", "Apply 'in millions' -> x1e6", "value, units_note"),
            ("norm_currency", "Currency Tagger", "normalize", "Strip $/EUR and tag currency", "s"),
            ("norm_percent", "Percent Parser", "normalize", "'12.5%' -> 0.125 or 12.5 per mode", "s"),
            ("norm_date", "Date Parser", "normalize", "Parse filing date strings", "s"),
            ("norm_period", "Period Resolver", "normalize", "Map column header -> FY/Q period", "col_header"),
        ],
    },
    "G04_resolve": {
        "title": "Metric Resolution",
        "module": "resolver",
        "extractors": [
            ("resolve_metric", "Metric Cell Resolver", "resolve", "Given metric+cells, return the ONE correct cell", "metric, cells, period"),
            ("score_cell", "Cell Scorer", "resolve", "Score how well a cell matches a metric", "metric, cell"),
            ("reject_decoy", "Decoy Rejecter", "resolve", "Drop cells matching anti-patterns", "metric, cell"),
        ],
    },
}
def all_extractors():
    """Return one flat list of every extractor tuple, in domain order."""
    return [entry for domain in DOMAINS.values() for entry in domain["extractors"]]
def count_summary():
    """Return a mapping of domain key to the number of extractors it lists."""
    counts = {}
    for key, domain in DOMAINS.items():
        counts[key] = len(domain["extractors"])
    return counts

extract_lib/resolver.py ADDED Viewed

@@ -0,0 +1,85 @@
+from __future__ import annotations
+from typing import Dict, List
+from .base import ExtractResult, extractor
+from .normalize.number import norm_number
+from .synonyms import METRIC_SYNONYMS
@extractor("reject_decoy", "Decoy Rejecter", "resolve", "Reject cells that match anti-patterns.")
def reject_decoy(metric: str, cell: Dict) -> bool:
    """Return True when *cell* must not be used for *metric*.

    A cell is rejected when its row header is blank, or when the lower-cased
    row header contains any of the metric's ``anti`` phrases.
    """
    spec = METRIC_SYNONYMS.get(metric, {})
    label = str(cell.get("row_header", "")).lower().strip()
    if not label:
        return True
    return any(phrase in label for phrase in spec.get("anti", []))
@extractor("score_cell", "Cell Scorer", "resolve", "Score how well a cell matches a metric.")
def score_cell(metric: str, cell: Dict, period: str = "") -> float:
    """Score how well *cell* matches *metric*; 0.0 means "do not use".

    The base score is the best tier reached by any ``positive`` phrase:
    1.0 for an exact row-header match, 0.6 when the phrase is contained in a
    row header fewer than 15 characters longer than the phrase, 0.3 for any
    other containment. The cell scores 0.0 when it is a decoy, when its
    value does not parse as a number, or when the absolute value is below
    the metric's ``value_min``. A column header containing *period* adds 0.3.
    """
    spec = METRIC_SYNONYMS.get(metric, {})
    label = str(cell.get("row_header", "")).lower().strip()
    if not label or reject_decoy(metric, cell):
        return 0.0

    def _tier(phrase):
        if phrase == label:
            return 1.0
        if phrase not in label:
            return 0.0
        return 0.6 if len(label) < len(phrase) + 15 else 0.3

    base = max((_tier(phrase) for phrase in spec.get("positive", [])), default=0.0)
    if base == 0.0:
        return 0.0

    number = norm_number(str(cell.get("value", "")))
    if number is None or abs(number) < spec.get("value_min", 0.0):
        return 0.0

    if period and period.lower() in str(cell.get("col_header", "")).lower():
        return base + 0.3
    return base
@extractor("resolve_metric", "Metric Cell Resolver", "resolve", "Return best cell for a metric.")
def resolve_metric(metric: str, cells: List[Dict], period: str = "") -> ExtractResult:
    """Pick the best-scoring cell for *metric* and wrap it in an ExtractResult.

    Cells scoring 0 are ignored. The result is invalid with error
    ``"no cell matched"`` when nothing scores, or ``"low confidence"`` when
    the best score is below 0.5. Otherwise the best cell's parsed value and
    location are returned, with confidence capped at 1.0 and up to five
    top-scoring candidates attached. On equal scores the earlier cell wins.
    """
    pairs = ((score_cell(metric, cell, period), cell) for cell in cells)
    ranked = sorted(
        (pair for pair in pairs if pair[0] > 0),
        key=lambda pair: pair[0],
        reverse=True,
    )
    if not ranked:
        return ExtractResult(metric_id=metric, value=None, valid=False, error="no cell matched", confidence=0.0)

    top_score, winner = ranked[0]
    if top_score < 0.5:
        return ExtractResult(metric_id=metric, value=None, valid=False, error="low confidence", confidence=top_score)

    raw = winner.get("value", "")
    parsed = norm_number(str(raw))
    shortlist = [
        {"score": score, "row": cell.get("row_header", ""), "val": cell.get("value", "")}
        for score, cell in ranked[:5]
    ]
    return ExtractResult(
        metric_id=metric,
        value=parsed,
        raw_value=str(raw),
        row_header=winner.get("row_header", ""),
        col_header=winner.get("col_header", ""),
        page=winner.get("page"),
        table_number=winner.get("table_number"),
        source="pdf_table",
        confidence=min(top_score, 1.0),
        valid=parsed is not None,
        candidates=shortlist,
    )

extract_lib/structure/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .table_model import Cell
+from .header_detect import detect_header_row
+from .cell_cluster import cluster_cells

extract_lib/structure/cell_cluster.py ADDED Viewed

@@ -0,0 +1,17 @@
def cluster_cells(words):
    """Group word dicts into coarse grid cells by position.

    Each word is bucketed by ``top // 8`` (row) and ``x0 // 30`` (column);
    missing coordinates count as 0 and non-dict entries are skipped. Texts
    in the same bucket are joined with a space in input order, ignoring
    blank texts.

    Returns:
        A list of ``{"row_bucket", "col_bucket", "text"}`` dicts sorted by
        row bucket, then column bucket; ``[]`` for empty input.
    """
    if not words:
        return []

    # Stable deterministic grouping by rounded bbox buckets when available.
    buckets = {}
    for word in words:
        if isinstance(word, dict):
            row = int(float(word.get("top", 0)) // 8)
            col = int(float(word.get("x0", 0)) // 30)
            buckets.setdefault((row, col), []).append(str(word.get("text", "")).strip())

    return [
        {"row_bucket": row, "col_bucket": col, "text": " ".join(filter(None, buckets[(row, col)]))}
        for row, col in sorted(buckets)
    ]

extract_lib/structure/header_detect.py ADDED Viewed

@@ -0,0 +1,17 @@
def detect_header_row(rows):
    """Guess which of the first five rows holds the column headers.

    Each row is scored as (non-blank string cells) minus (cells that look
    numeric once commas, one dot and one minus sign are removed). The first
    row with the highest score wins; index 0 is returned when no row scores
    above -1.

    Returns:
        The 0-based row index, or ``None`` for empty input.
    """
    if not rows:
        return None

    def _is_text(cell):
        return isinstance(cell, str) and bool(cell.strip())

    def _is_numeric(cell):
        digits = str(cell or "").strip().replace(",", "")
        if not digits:
            return False
        return digits.replace(".", "", 1).replace("-", "", 1).isdigit()

    margins = [
        sum(map(_is_text, row)) - sum(map(_is_numeric, row))
        for row in rows[:5]
    ]
    top = max(margins)
    # list.index returns the first occurrence, so earlier rows win ties.
    return margins.index(top) if top > -1 else 0

extract_lib/structure/table_model.py ADDED Viewed

@@ -0,0 +1,11 @@
+from dataclasses import dataclass
+from typing import Optional
@dataclass
class Cell:
    """A single table value together with its row and column labels."""

    # Labels and raw text of the cell.
    row_header: str
    col_header: str
    value: str

    # Optional location of the cell in the source document.
    page: Optional[int] = None
    table_number: Optional[int] = None
+    table_number: Optional[int] = None

extract_lib/synonyms.py ADDED Viewed

@@ -0,0 +1,43 @@
# Per-metric matching rules, keyed by metric id:
#   "positive"  - lower-case row-header phrases that identify the metric
#   "anti"      - lower-case phrases that disqualify a row
#   "value_min" - smallest absolute value accepted for the metric
_REVENUE_POSITIVE = [
    "total revenue",
    "net sales",
    "total net sales",
    "net revenue",
    "total revenues",
    "revenue, net",
]
_REVENUE_ANTI = [
    "revenue-generating",
    "revenue generating",
    "deferred revenue",
    "revenue recognition",
    "unearned revenue",
    "by segment",
    "disaggregation",
    "activities",
]
_COGS_POSITIVE = [
    "cost of sales",
    "cost of goods sold",
    "cost of products sold",
    "cost of revenue",
]
_NET_INCOME_POSITIVE = [
    "net income",
    "net earnings",
    "net income attributable to",
    "profit for the year",
]

METRIC_SYNONYMS = {
    "revenue": {
        "positive": _REVENUE_POSITIVE,
        "anti": _REVENUE_ANTI,
        "value_min": 1.0,
    },
    "cogs": {
        "positive": _COGS_POSITIVE,
        "anti": ["percentage", "% of", "ratio"],
        "value_min": 1.0,
    },
    "net_income": {
        "positive": _NET_INCOME_POSITIVE,
        "anti": ["per share", "margin", "%", "comprehensive", "noncontrolling"],
        "value_min": 0.0,
    },
}

fina_extractor_lib/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from extract_lib import * # noqa: F401,F403

fina_extractor_lib-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,17 @@
+Metadata-Version: 2.4
+Name: fina_extractor_lib
+Version: 0.1.0
+Summary: Deterministic financial document extraction library
+Author: Potheesh Vignesh K
+License: Proprietary
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+Requires-Dist: pdfplumber>=0.11
+Requires-Dist: lxml>=5.0
+Requires-Dist: beautifulsoup4>=4.12
+Requires-Dist: openpyxl>=3.1
+Requires-Dist: python-docx>=1.1
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0; extra == "dev"
+
+# fina-extractor-lib
+
+Deterministic financial document extraction library.

fina_extractor_lib-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,31 @@
+extract_lib/__init__.py,sha256=N1Fnpqdk9uxoL_v3iKgHIN5Cjnbu0y4TKVgFujF3GUE,475
+extract_lib/base.py,sha256=vdwCi0rYWmQYD02v-HvSUO85oQl49BCadC_z6RsNkQQ,1146
+extract_lib/exceptions.py,sha256=GThQ6tE-3BEvOMtXkAQJ5bhLUMgiovJDmrT9s76fPXg,68
+extract_lib/register.py,sha256=1l2XskRM5SvnS69JpgHGD1yaS7qpBZkxPUgDiN0zSYs,119
+extract_lib/registry.py,sha256=an5HJqyByOqXXjEfu1DP5dLgtcdKhQzUvxaLCdpuW44,2945
+extract_lib/resolver.py,sha256=h_Mz0ORL-ulRrOPRomFN4kugve4QcoWhzE77tR2fKAI,2756
+extract_lib/synonyms.py,sha256=V4HIpJQr58xo8nflXrqXuHs8HF3mS8FqyfQQN4BFvPs,1111
+extract_lib/normalize/__init__.py,sha256=F3cueDq1Uw1QecFY5QbvaFH5wTgH5YncEvtI270HXTI,90
+extract_lib/normalize/currency.py,sha256=4T_oaVHdpj4byDqVO0SzUuMaK5zkKhB0WaGgX6NVuFA,253
+extract_lib/normalize/dates.py,sha256=LZRTjMvtTgGZGh10cdBIDW-bWSAYb5hFhZXWYh6jwMo,355
+extract_lib/normalize/number.py,sha256=aPybgBiOemmStJHDBrqiAeutN5T7Wgnp8gcVBX69bxc,1483
+extract_lib/normalize/period.py,sha256=Sq5Tc0oObXTfyjN0EBwKblHoMdDCr7iBokp5o6E6h5A,409
+extract_lib/normalize/scale.py,sha256=Jt9vOxhXktGZzowxhW_WDz1u3pbcDMWiNw5Xd9h5I6g,477
+extract_lib/normalize/sign.py,sha256=fj7Z7MCqC4DQpmBsgv4c1lrhHMqqqs_SlSEXy6NCCSM,175
+extract_lib/readers/__init__.py,sha256=Ks5T4uRpj3BubxJfGQLPhXMssOpvaS-B7mPHHerB1xg,106
+extract_lib/readers/docx_reader.py,sha256=BL8mgkpKDK0m79RjVtZ46M3vtWOpDObPbO7Bj_RN9V4,724
+extract_lib/readers/html_reader.py,sha256=6idomIp2hXsbLpAdUprE4YJB-GwedR51oy6GZvllyzo,632
+extract_lib/readers/ixbrl.py,sha256=UUd9sTdgpxR47iD-3zPJ_5TQByeF2nxM1B8pTHxGgAY,838
+extract_lib/readers/ocr.py,sha256=bWT9S7s1eqpy6oglJc7NsWjOnGG86zUQWigRFQgQhtU,182
+extract_lib/readers/pdf_tables.py,sha256=gHnq6GU4n0_hKlJGrRB89knPX-9ou5emR7OreA8q4dA,1305
+extract_lib/readers/pdf_text.py,sha256=987h2tpCd9tUWZJHF5VyYCyw_9YuT1IYR-zHMuSvuco,506
+extract_lib/readers/xlsx_reader.py,sha256=0zgYR7E6FfRzhNZeSXfviVEXm9f6xtbsEOuK35lj43M,616
+extract_lib/structure/__init__.py,sha256=mcKBHzLwpXQRghTZICsdGT6zA2JGfot0FCTrlO9q0fI,115
+extract_lib/structure/cell_cluster.py,sha256=aZk9uPNPk-1GF3g0rJYxwOBbyg_EqVu7Q84GC7FJhQI,601
+extract_lib/structure/header_detect.py,sha256=HCN8d1p-wKRT1LIgaGLdh9RsVUG38uQdYeJgSagAr6M,574
+extract_lib/structure/table_model.py,sha256=B4EmAU4Cfsu64ICNeTr2obsAGr_toiwM_YSs40doW00,212
+fina_extractor_lib/__init__.py,sha256=xPlsxDtwtFnVC1A1yY_9_YHbEHBqkX0d4WwbMvXVfpA,45
+fina_extractor_lib-0.1.0.dist-info/METADATA,sha256=W84PX9hxwoSdbo5CQGKAcgMK2q47eykD3g25b24pt7U,543
+fina_extractor_lib-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+fina_extractor_lib-0.1.0.dist-info/top_level.txt,sha256=F-3yzWVOJGtNzBPabe7WpG1UgCUBrpeGb1FPYLDnS58,31
+fina_extractor_lib-0.1.0.dist-info/RECORD,,

fina_extractor_lib-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

fina_extractor_lib-0.1.0.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ extract_lib
2	+ fina_extractor_lib