teklia-layout-reader 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,135 @@
+ from pathlib import Path
+
+ import cv2
+ import numpy as np
+ from matplotlib import pyplot as plt
+
+ from layout_reader.datasets.utils import check_is_valid_bbx, make_bbx_valid, resize_bbx
+
+
+ class LineDetector:
+     def __init__(
+         self,
+         process_size: tuple[int, int] = (2000, 1000),
+         target_size: tuple[int, int] = (1000, 1000),
+         filter_ratio: tuple[float, float] = (0.05, 0.05),
+     ):
+         """
+         Initialize the line detector with processing and filtering parameters.
+
+         Args:
+             process_size: (width, height) images are resized to before detection
+             target_size: (width, height) detected lines are rescaled to
+             filter_ratio: (width_ratio, height_ratio) minimum extent of a line,
+                 relative to the target size, for it to be kept
+         """
+         self.process_width, self.process_height = process_size
+         self.target_width, self.target_height = target_size
+         self.filter_width_ratio, self.filter_height_ratio = filter_ratio
+
+         # Create LSD detector
+         self.lsd = cv2.createLineSegmentDetector(cv2.LSD_REFINE_STD)
+
+     def calculate_line_properties(self, line: np.ndarray) -> tuple[float, float, float]:
+         """
+         Calculate line properties: height, width and angle.
+
+         Args:
+             line: Line coordinates [x1, y1, x2, y2]
+
+         Returns:
+             Tuple of (height, width, angle_degrees)
+         """
+         x1, y1, x2, y2 = line
+
+         # Calculate absolute extents (endpoint order is arbitrary)
+         height = abs(y2 - y1)
+         width = abs(x2 - x1)
+
+         # Calculate angle (in degrees)
+         angle = np.degrees(np.arctan2(y2 - y1, x2 - x1))
+
+         return height, width, angle
+
+     def filter_lines(
+         self, lines: np.ndarray, image_height: int, image_width: int
+     ) -> list[np.ndarray]:
+         """
+         Keep lines that are long enough and close to horizontal or vertical.
+
+         Args:
+             lines: Array of line coordinates
+             image_height: Height of the image the lines belong to
+             image_width: Width of the image the lines belong to
+
+         Returns:
+             List of filtered lines
+         """
+         if len(lines) == 0:
+             return lines
+
+         filtered_lines = []
+
+         for line in lines:
+             height, width, angle = self.calculate_line_properties(line)
+
+             # Keep lines exceeding the minimum extent and close to an axis
+             if (
+                 (height > width and height > self.filter_height_ratio * image_height)
+                 or (width >= height and width > self.filter_width_ratio * image_width)
+             ) and (abs(angle) < 10 or abs(angle) > 80):
+                 filtered_lines.append(line)
+
+         return filtered_lines
+
+     def visualize(self, image, lines, output_path="results.jpg"):
+         vis_image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
+
+         # Draw lines
+         for line in lines:
+             x1, y1, x2, y2 = line
+             cv2.line(vis_image, (x1, y1), (x2, y2), (255, 0, 0), 2)
+
+         # Display
+         h, w = image.shape
+         plt.figure(figsize=(w / 100, h / 100), dpi=100)
+         plt.axis("off")
+         plt.imshow(vis_image)
+         plt.tight_layout(pad=0)
+         plt.savefig(output_path)
+         plt.close()
+
+     def process(self, image_path: Path, visualize: bool = False) -> list[list[int]]:
+         """
+         Detect line separators in an image and return them as valid bounding boxes.
+         """
+         image = cv2.imread(str(image_path), flags=cv2.IMREAD_GRAYSCALE)
+
+         # Resize and detect
+         resized_image = cv2.resize(image, (self.process_width, self.process_height))
+         lines = self.lsd.detect(resized_image)[0].reshape(-1, 4)
+
+         # Scale lines to target size
+         lines = resize_bbx(
+             lines,
+             width=self.process_width,
+             height=self.process_height,
+             target_width=self.target_width,
+             target_height=self.target_height,
+         )
+
+         # Filter lines
+         lines = self.filter_lines(lines, self.target_height, self.target_width)
+
+         # Transform lines into valid bounding boxes
+         for i in range(len(lines)):
+             lines[i] = lines[i].tolist()
+             if not check_is_valid_bbx(lines[i]):
+                 lines[i] = make_bbx_valid(lines[i])
+
+         # Visualize
+         if visualize:
+             target_image = cv2.resize(image, (self.target_width, self.target_height))
+             self.visualize(target_image, lines, f"{image_path.stem}_filter.jpg")
+
+         return lines
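
For reference, a minimal usage sketch of the detector above. The module path layout_reader.detector is an assumption (only layout_reader.datasets.utils is visible in this diff), and page.jpg is a placeholder image:

    from pathlib import Path

    from layout_reader.detector import LineDetector  # assumed module path

    detector = LineDetector(process_size=(2000, 1000), target_size=(1000, 1000))
    # Returns [x1, y1, x2, y2] separator boxes in the 1000x1000 target space
    separators = detector.process(Path("page.jpg"), visualize=True)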
@@ -0,0 +1,143 @@
+ """Helpers."""
+
+ import gzip
+ import json
+ from pathlib import Path
+
+ import numpy as np
+
+ MAX_LEN = 510
+ UNK_TOKEN_ID = 3
+ # CLS_TOKEN_ID = 0
+ # EOS_TOKEN_ID = 2
+ CLS_SHIFT = 4  # 0 1 2 3 are already taken
+ BBX_FACTOR = 1000
+
+
+ def check_is_valid_bbx(bbx: list[int], min_value=0, max_value=1000) -> bool:
+     """
+     Check that box coordinates are ordered and fall within [min_value, max_value).
+     """
+     x1, y1, x2, y2 = bbx
+     return (
+         (y1 < y2)
+         and (x1 < x2)
+         and (x2 < max_value)
+         and (y2 < max_value)
+         and (x1 >= min_value)
+         and (y1 >= min_value)
+     )
+
+
+ def convert_to_bbx(
+     polygon: list[list[float]], bbx_factor, min_value=0, max_value=1000
+ ) -> list[int]:
+     """
+     Convert a polygon to a bounding box.
+     """
+     # Extract x and y from polygon
+     xs = np.array(polygon)[:, 0]
+     ys = np.array(polygon)[:, 1]
+
+     # Scale between min_value and max_value
+     x1 = max(min_value, int(xs.min() * bbx_factor))
+     y1 = max(min_value, int(ys.min() * bbx_factor))
+     x2 = min(max_value - 1, int(xs.max() * bbx_factor))
+     y2 = min(max_value - 1, int(ys.max() * bbx_factor))
+     return [x1, y1, x2, y2]
+
+
+ def make_bbx_valid(box, min_value: int = 0, max_value: int = 1000) -> list[int]:
+     """
+     Clip a box to the valid range and shift degenerate coordinates apart.
+     """
+     x1, y1, x2, y2 = box
+     # Clip
+     x1 = max(min_value, x1)
+     y1 = max(min_value, y1)
+     x2 = min(max_value - 1, x2)
+     y2 = min(max_value - 1, y2)
+
+     # Shift equal coordinates
+     if x1 == x2:
+         if x2 != max_value - 1:
+             x2 += 1
+         else:
+             x1 -= 1
+     if y1 == y2:
+         if y2 != max_value - 1:
+             y2 += 1
+         else:
+             y1 -= 1
+     return [x1, y1, x2, y2]
+
+
+ def resize_bbx(
+     lines: np.ndarray, width: int, height: int, target_width: int, target_height: int
+ ) -> np.ndarray:
+     """
+     Rescale line coordinates from (width, height) to (target_width, target_height).
+     """
+     w_ratio = target_width / width
+     h_ratio = target_height / height
+     newlines = []
+     for line in lines:
+         x1, y1, x2, y2 = line.tolist()
+         newlines.append(
+             np.array([x1 * w_ratio, y1 * h_ratio, x2 * w_ratio, y2 * h_ratio])
+         )
+     return np.array([sep.astype(int) for sep in newlines])
+
+
+ def save_gzip_jsonl(filename: Path, content: list[dict]) -> None:
+     """
+     Write content in GZIP JSONL format.
+
+     Args:
+         filename (Path): Output filename.
+         content (list[dict]): Records to write, one JSON object per line.
+     """
+     with gzip.open(filename, "wt") as f:
+         f.write("\n".join([json.dumps(c) for c in content]))
+
+
+ def load_gzip_jsonl(filename: Path) -> list[dict]:
+     """
+     Read content in GZIP JSONL format.
+
+     Args:
+         filename (Path): Input filename.
+
+     Returns:
+         One dict per JSONL line.
+     """
+     with gzip.open(filename, "rt") as f:
+         content = f.read().splitlines()
+     return [json.loads(c) for c in content]
+
+
+ def check_too_many_zones(
+     boxes: list[list[int]], separators: list[list[int]], max_len: int = 512
+ ) -> bool:
+     """
+     Check whether the page holds more zones than the model can order.
+     """
+     # Count total objects + [BOS] + [EOS]
+     return (len(separators) + len(boxes) + 2) >= max_len
+
+
+ def load_yolo_line(line, bbx_factor) -> tuple[int, list[int]]:
+     """
+     Parse a YOLO annotation line into a class ID and an [x1, y1, x2, y2] box.
+     """
+     parts = line.strip().split()
+     if len(parts) != 5:
+         raise ValueError(f"Invalid YOLO format: expected 5 values, got {len(parts)}")
+     classif = int(parts[0])
+     x_c, y_c, w, h = map(float, parts[1:])
+     x_min = (x_c - w / 2) * bbx_factor
+     y_min = (y_c - h / 2) * bbx_factor
+     x_max = (x_c + w / 2) * bbx_factor
+     y_max = (y_c + h / 2) * bbx_factor
+     box = [int(x_min), int(y_min), int(x_max), int(y_max)]
+     return classif, box
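
A short sketch of how these helpers compose, assuming they live in layout_reader.datasets.utils (the module the first file imports from); the YOLO line and file name are made up for illustration:

    from pathlib import Path

    from layout_reader.datasets.utils import (
        check_is_valid_bbx,
        load_gzip_jsonl,
        load_yolo_line,
        make_bbx_valid,
        save_gzip_jsonl,
    )

    # Hypothetical YOLO record: class 2, box centered on the page
    classif, box = load_yolo_line("2 0.5 0.5 0.2 0.1", bbx_factor=1000)
    assert check_is_valid_bbx(box)  # box == [400, 450, 600, 550]

    # Degenerate boxes are shifted apart before being stored
    assert make_bbx_valid([400, 300, 400, 300]) == [400, 300, 401, 301]

    save_gzip_jsonl(Path("sample.jsonl.gz"), [{"class": classif, "box": box}])
    assert load_gzip_jsonl(Path("sample.jsonl.gz")) == [{"class": classif, "box": box}]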
@@ -0,0 +1,365 @@
+ import logging
+ import random
+ from collections import defaultdict
+ from pathlib import Path
+
+ import numpy as np
+ import torch
+ import yaml
+ from colour import Color
+ from datasets import load_dataset
+ from PIL import Image, ImageDraw, ImageFont
+ from transformers import LayoutLMv3ForTokenClassification
+
+ logger = logging.getLogger(__name__)
+ FONT = ImageFont.truetype("fonts/LinuxLibertine.ttf")
+
+
+ # Maximum number of zones to be ordered.
+ MAX_LEN = 510
+
+ # Maximum coordinate after normalization
+ MAX_COOR = 1000
+
+ # Custom classes used by LayoutReader
+ CLS_TOKEN_ID = 0  # Class token
+ PAD_TOKEN_ID = 3  # Padding token
+ EOS_TOKEN_ID = 2  # End-of-sequence token
+
+ # Label to be ignored in the loss computation (padding, separators...)
+ IGNORE_LABEL_ID = -100
+
+ # Width (1000) divided in 10 columns => 100 pixels/column
+ COLUMN_WIDTH = 1000 // 10
+
+
+ def sort_zones(zones, sort_method: str):
+     """Sort zones in place according to sort_method."""
+     if sort_method == "random":
+         # Shuffle in place so the caller sees the new order
+         random.shuffle(zones)
+         return
+
+     sort_keys = {
+         "sortxy_by_column": lambda z: (z[1][0] // COLUMN_WIDTH, z[1][1]),
+         "sortxy": lambda z: (z[1][0], z[1][1]),
+         "sortyx": lambda z: (z[1][1], z[1][0]),
+     }
+     zones.sort(key=sort_keys[sort_method])
+
+
+ def sort_sample(
+     element, sort_ratio: float = 0.5, sort_method: str = "sortxy_by_column"
+ ):
+     """
+     Sort the zones of a page, with probability sort_ratio, using sort_method.
+     """
+     # Get the boxes and classes to sort
+     boxes = element.get("target_boxes") or element.get("source_boxes")
+     classes = element.get("target_classes") or element.get("source_classes") or []
+
+     if random.random() > sort_ratio or not boxes:
+         return element
+
+     if classes:
+         zones = [
+             (i, box, cls)
+             for i, (box, cls) in enumerate(zip(boxes, classes, strict=True))
+         ]
+     else:
+         zones = [(i, box) for i, box in enumerate(boxes)]
+
+     # Sort the zones
+     sort_zones(zones, sort_method)
+
+     if "target_index" in element:
+         target_orders = (
+             np.argsort([zone[0] for zone in zones]) + 1
+         ).tolist()  # start at 1
+         if classes:
+             _, boxes, classes = map(list, zip(*zones, strict=True))
+         else:
+             _, boxes = map(list, zip(*zones, strict=True))
+         element["target_index"] = target_orders
+         element["source_boxes"] = boxes
+         element["source_classes"] = classes
+         return element
+
+     if classes:
+         _, boxes, classes = map(list, zip(*zones, strict=True))
+     else:
+         _, boxes = map(list, zip(*zones, strict=True))
+
+     element["source_boxes"] = boxes
+     element["source_classes"] = classes
+     return element
+
+
+ def read_yaml(filename: str):
+     if not Path(filename).exists():
+         raise FileNotFoundError(f"Configuration not found: {filename}")
+     return yaml.safe_load(Path(filename).read_text())
+
+
+ def load_dataset_split(dataset_path: str, split: str):
+     filename = Path(dataset_path) / f"{split}.jsonl.gz"
+     if not filename.exists():
+         raise FileNotFoundError(f"Dataset file not found: {filename}")
+     try:
+         return load_dataset(
+             "json",
+             data_files={split: str(filename)},
+         )[split]
+     except Exception as e:
+         raise ValueError(f"Failed to load dataset file {filename}") from e
+
+
+ def load_model(model_path: str):
+     try:
+         return LayoutLMv3ForTokenClassification.from_pretrained(
+             model_path,
+             device_map="auto",
+         )
+     except Exception as e:
+         raise ValueError(
+             f"Failed to load model from '{model_path}'. "
+             "Model path must be a valid Hugging Face model ID or a local directory."
+         ) from e
+
+
+ class DataCollator:
+     def __init__(self, with_classes: bool = False, with_separators: bool = False):
+         self.with_classes = with_classes
+         self.with_separators = with_separators
+
+     def _truncate(self, seq: list, name: str, max_len: int) -> list:
+         if len(seq) > max_len:
+             logger.warning(
+                 f"Truncated {name}. Length ({len(seq)}) exceeds max length ({max_len})."
+             )
+             return seq[:max_len]
+         return seq
+
+     def _prepare_single_feature(self, feature: dict) -> dict[str, list]:
+         bboxes = feature["source_boxes"].copy()
+         len_boxes = len(bboxes)
+         separators = list(feature.get("separators", []))
+         if self.with_separators:
+             bboxes.extend(separators)
+         bboxes = self._truncate(bboxes, "bounding boxes", MAX_LEN)
+
+         # Prepare reading order (copy so the feature is not mutated)
+         # Separators should be ignored in the loss
+         labels = list(feature["target_index"])
+         if self.with_separators:
+             labels.extend([IGNORE_LABEL_ID] * len(separators))
+         labels = self._truncate(labels, "labels", MAX_LEN)
+
+         # Prepare classes
+         input_ids = (
+             list(feature["source_classes"])
+             if self.with_classes
+             else [PAD_TOKEN_ID] * len_boxes
+         )
+         if self.with_separators:
+             input_ids.extend([PAD_TOKEN_ID] * len(separators))
+         input_ids = self._truncate(input_ids, "input_ids", MAX_LEN)
+
+         # Prepare attention mask
+         attention_mask = [1] * len(bboxes)
+
+         # Sanity check
+         assert len(bboxes) == len(labels) == len(input_ids) == len(attention_mask), (
+             f"Length mismatch: bbox={len(bboxes)}, labels={len(labels)}, "
+             f"input_ids={len(input_ids)}, attention_mask={len(attention_mask)}"
+         )
+
+         return {
+             "bbox": bboxes,
+             "labels": labels,
+             "input_ids": input_ids,
+             "attention_mask": attention_mask,
+         }
+
+     def _add_special_tokens(self, batch: dict[str, list[list]]) -> None:
+         """Add CLS and EOS tokens (in-place)."""
+         for i in range(len(batch["bbox"])):
+             batch["bbox"][i] = [[0, 0, 0, 0]] + batch["bbox"][i] + [[0, 0, 0, 0]]
+             batch["labels"][i] = (
+                 [IGNORE_LABEL_ID] + batch["labels"][i] + [IGNORE_LABEL_ID]
+             )
+             batch["input_ids"][i] = (
+                 [CLS_TOKEN_ID] + batch["input_ids"][i] + [EOS_TOKEN_ID]
+             )
+             batch["attention_mask"][i] = [1] + batch["attention_mask"][i] + [1]
+
+     def _pad_sequences(self, batch: dict[str, list[list]]) -> None:
+         """Pad all sequences to max length (in-place)."""
+         max_len = max(len(x) for x in batch["bbox"])
+
+         for i in range(len(batch["bbox"])):
+             pad_len = max_len - len(batch["bbox"][i])
+             batch["bbox"][i] += [[0, 0, 0, 0]] * pad_len
+             batch["labels"][i] += [IGNORE_LABEL_ID] * pad_len
+             batch["input_ids"][i] += [PAD_TOKEN_ID] * pad_len
+             batch["attention_mask"][i] += [0] * pad_len
+
+     def __call__(self, features: list[dict]) -> dict[str, torch.Tensor]:
+         batch = {
+             "bbox": [],
+             "labels": [],
+             "input_ids": [],
+             "attention_mask": [],
+         }
+
+         for feature in features:
+             processed = self._prepare_single_feature(feature)
+             for key in batch:
+                 batch[key].append(processed[key])
+
+         # Add special tokens
+         self._add_special_tokens(batch)
+
+         # Pad to same length
+         self._pad_sequences(batch)
+
+         batch_tensors = {
+             key: torch.tensor(batch[key], dtype=torch.long) for key in batch
+         }
+
+         # Post-process labels: ignore out-of-range orders and shift to 0-based
+         labels_tensor = batch_tensors["labels"]
+         labels_tensor[labels_tensor > MAX_LEN] = IGNORE_LABEL_ID
+         labels_tensor[labels_tensor > 0] -= 1
+         batch_tensors["labels"] = labels_tensor
+
+         return batch_tensors
+
+
+ def boxes_to_inputs(boxes, cls, separators) -> dict[str, torch.Tensor]:
+     """Build a single-sample batch of model inputs from boxes and classes."""
+     all_boxes = boxes.copy()
+     if separators:
+         all_boxes += separators
+         if cls:
+             # Avoid mutating the caller's list
+             cls = cls + [PAD_TOKEN_ID] * len(separators)
+
+     if not cls:
+         cls = [PAD_TOKEN_ID] * len(all_boxes)
+
+     bbox = [[0, 0, 0, 0]] + all_boxes + [[0, 0, 0, 0]]
+     input_ids = [CLS_TOKEN_ID] + cls + [EOS_TOKEN_ID]
+     attention_mask = [1] + [1] * len(all_boxes) + [1]
+     return {
+         "bbox": torch.tensor([bbox]),
+         "attention_mask": torch.tensor([attention_mask]),
+         "input_ids": torch.tensor([input_ids]),
+     }
+
+
+ def prepare_inputs(
+     inputs: dict[str, torch.Tensor], model: LayoutLMv3ForTokenClassification
+ ) -> dict[str, torch.Tensor]:
+     """Move inputs to the model's device (and dtype, for floating tensors)."""
+     prepared = {}
+     for key, tensor in inputs.items():
+         tensor = tensor.to(model.device)
+         if torch.is_floating_point(tensor):
+             tensor = tensor.to(model.dtype)
+         prepared[key] = tensor
+     return prepared
+
+
+ def parse_logits(logits: torch.Tensor, length: int) -> list[int]:
+     """
+     Convert logits to reading orders.
+     """
+
+     def _find_conflicts(assigned_orders: list[int]) -> dict[int, list[int]]:
+         order_to_elements = defaultdict(list)
+         for element_idx, order in enumerate(assigned_orders):
+             order_to_elements[order].append(element_idx)
+
+         # Keep only positions with conflicts
+         return {
+             order: elements
+             for order, elements in order_to_elements.items()
+             if len(elements) > 1
+         }
+
+     def _resolve_conflicts(
+         conflicts: dict[int, list[int]],
+         assigned_orders: list[int],
+         candidate_orders: list[list[int]],
+         logits: torch.Tensor,
+     ) -> None:
+         """
+         Resolve conflicts based on logits values (keep highest).
+         """
+         for order, element_indices in conflicts.items():
+             # Get logit scores for all elements predicted to current order
+             elements_by_score = [
+                 (element_idx, logits[element_idx, order].item())
+                 for element_idx in element_indices
+             ]
+             elements_by_score.sort(key=lambda x: x[1], reverse=True)
+
+             # Reassign all but the highest-scoring element to next candidates
+             for element_idx, _ in elements_by_score[1:]:
+                 assigned_orders[element_idx] = candidate_orders[element_idx].pop()
+
+     # Extract relevant logits (skip special tokens)
+     logits = logits[1 : length + 1, :length]
+
+     # Get sorted candidate positions for each element (ascending order)
+     # Each row contains position candidates from lowest to highest score
+     candidate_orders = logits.argsort(descending=False).tolist()
+
+     # Initialize with best candidate for each element
+     assigned_orders = [candidates.pop() for candidates in candidate_orders]
+
+     while True:
+         conflicts = _find_conflicts(assigned_orders)
+         if not conflicts:
+             break
+         _resolve_conflicts(conflicts, assigned_orders, candidate_orders, logits)
+
+     return assigned_orders
+
+
+ def check_duplicate(seq: list[int]) -> bool:
+     return len(seq) != len(set(seq))
+
+
+ def save_visualization(
+     image_path: Path,
+     boxes: list[list[int]],
+     predicted_order: list[int],
+     output_path: Path,
+ ):
+     colors = list(Color("red").range_to(Color("green"), len(boxes)))
+     page = Image.open(image_path)
+     draw = ImageDraw.Draw(page)
+
+     center = (0, 0)
+     for order, index in enumerate(predicted_order):
+         x1, y1, x2, y2 = boxes[index]
+         x1 = int(x1 / MAX_COOR * page.width)
+         y1 = int(y1 / MAX_COOR * page.height)
+         x2 = int(x2 / MAX_COOR * page.width)
+         y2 = int(y2 / MAX_COOR * page.height)
+
+         if order > 0:
+             draw.line(
+                 [center, ((x1 + x2) / 2, (y1 + y2) / 2)],
+                 fill=colors[order].hex,
+                 width=2,
+             )
+         draw.rectangle([(x1, y1), (x2, y2)], outline=colors[order].hex, width=4)
+         draw.text(
+             (x1, y1), text=str(order), font=FONT, fill=colors[order].hex, align="left"
+         )
+         center = ((x1 + x2) / 2, (y1 + y2) / 2)
+
+     page.save(output_path, "JPEG")
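
Finally, a minimal end-to-end inference sketch tying these pieces together, assuming the functions above are in scope. The checkpoint path and boxes are placeholders, and it assumes the model's classifier head has at least as many labels as there are zones (parse_logits indexes logits[:, :length]):

    import torch

    model = load_model("path/to/checkpoint")  # placeholder path

    # Three zones in the 0-1000 coordinate space: a header and two columns
    boxes = [[100, 80, 900, 140], [100, 160, 490, 600], [510, 160, 900, 600]]

    inputs = prepare_inputs(boxes_to_inputs(boxes, cls=[], separators=[]), model)
    with torch.no_grad():
        logits = model(**inputs).logits[0]  # drop the batch dimension

    # One reading-order position per zone, conflicts resolved by logit score
    order = parse_logits(logits, length=len(boxes))
    assert not check_duplicate(order)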