PyPI - inventory-ocr - Versions diffs - 0.0.1__py3-none-any.whl - Mend

inventory-ocr 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

inventory_ocr/__init__.py +0 -0
inventory_ocr/annotate.py +53 -0
inventory_ocr/config/example_regions.yaml +16 -0
inventory_ocr/detection.py +95 -0
inventory_ocr/postprocessor.py +249 -0
inventory_ocr/recognition.py +302 -0
inventory_ocr/resources/example_output.json +4848 -0
inventory_ocr/run.py +169 -0
inventory_ocr/utils.py +50 -0
inventory_ocr-0.0.1.dist-info/METADATA +24 -0
inventory_ocr-0.0.1.dist-info/RECORD +15 -0
inventory_ocr-0.0.1.dist-info/WHEEL +5 -0
inventory_ocr-0.0.1.dist-info/entry_points.txt +2 -0
inventory_ocr-0.0.1.dist-info/licenses/LICENSE +201 -0
inventory_ocr-0.0.1.dist-info/top_level.txt +1 -0

inventory_ocr/__init__.py ADDED Viewed

File without changes

inventory_ocr/annotate.py ADDED Viewed

@@ -0,0 +1,53 @@
+import gradio as gr
+from gradio_image_annotation import image_annotator
+import os
+import yaml
+def create_annotation_app(input_dir, layout_file_path):
+    """
+    Returns a gr.Blocks app to annotate the layout.
+    TODOS: Display a nice success message and close the browser window before killing the app serving process. But gradio conditional rendering is a pain in the...
+    """
+    print("Loading template image for region annotation...")
+    img_files = [f for f in os.listdir(input_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
+    assert img_files, "No image files found in the input directory."
+    template_image_path = os.path.join(input_dir, img_files[0])
+    def set_layout(layout, state):
+        return state + [layout]
+    def extract_layout(annotations, layout_state):
+        (h, w, c) = annotations['image'].shape
+        regions = {}
+        for box in annotations['boxes']:
+            xmin = int(box['xmin']) / w
+            xmax = int(box['xmax']) / w
+            ymin = int(box['ymin']) / h
+            ymax = int(box['ymax']) / h
+            label = box['label']
+            regions[label] = [xmin, ymin, xmax, ymax]
+        print(f"Saving layout configuration to {layout_file_path}...")
+        with open(layout_file_path, 'w') as f:
+            yaml.dump({'regions': regions}, f, default_flow_style=False)
+        os._exit(0)
+        return gr.Markdown("Layout succesfully defined. You may now close this window."), regions
+    with gr.Blocks() as annotation_app:
+        gr.Markdown("# Layout Annotation Tool")
+        layout_state = gr.State([])
+        gr.Markdown("### Annotate the Image")
+        @gr.render(inputs=layout_state)
+        def show_annotation_interface(layout):
+            annotations = image_annotator({"image": template_image_path, "boxes": []},
+                                            label_list=["Field Name"],
+                                            label_colors=[(0, 255, 0)],
+                                            boxes_alpha=0.4)
+            extract_layout_button = gr.Button("Extract Layout")
+            close_message =  gr.Markdown("", visible=False)
+            extract_layout_button.click(extract_layout, inputs=[annotations, layout_state], outputs=[close_message, layout_state])
+    return annotation_app

inventory_ocr/config/example_regions.yaml ADDED Viewed

@@ -0,0 +1,16 @@
+# Example layout configuration: every key under `regions` is a field label
+# mapped to one bounding box.
+# Boxes are [xmin, ymin, xmax, ymax], given as fractions of the image width
+# (x values) and image height (y values); this is the same structure that
+# inventory_ocr/annotate.py writes when a layout is extracted.
+regions:
+  Gegenstand: [0, 0, 1, 0.1]
+  Inv. Nr.: [0.044, 0.08, 0.275, 0.145]
+  Herkunft: [0.2737, 0.0806, 0.5054, 0.237]
+  Foto Notes: [0.5054, 0.0806, 1, 0.237]
+  Standort: [0.047, 0.135, 0.275, 0.235]
+  Material: [0.047, 0.237, 0.275, 0.435]
+  Datierung: [0.275, 0.237, 0.505, 0.317]
+  Maße: [0.275, 0.317, 0.505, 0.435]
+  erworben von: [0.047, 0.435, 0.505, 0.521]
+  Beschreibung: [0.047, 0.521, 0.505, 0.794]
+  Ausstellungen: [0.047, 0.795, 0.505, 1]
+  am: [0.505, 0.435, 0.707, 0.53]
+  Preis: [0.707, 0.435, 0.869, 0.53]
+  Vers.-Wert: [0.870, 0.435, 1.0, 0.53]
+  Literatur: [0.50, 0.53, 1.0, 1.0]

inventory_ocr/detection.py ADDED Viewed

@@ -0,0 +1,95 @@
+from ultralytics import YOLO
+from inventory_ocr.utils import download_and_unzip
+import glob
+import shutil
+import torch
+import cv2
+import os
+class Detector:
+    """Abstract base class for different detector implementations."""
+    def parse_directory(self, input_dir, crop_dir='tmp', output_base_dir='output'):
+        """Parse a directory of images and save cropped images to the output directory."""
+        raise NotImplementedError("This method should be overridden by subclasses.")
+    def detect(self, image):
+        """Detect objects in a single image."""
+        raise NotImplementedError("This method should be overridden by subclasses.")
+    def crop_and_save(self, detections, out_dir, name):
+        """Crop detected objects and save them to the specified directory."""
+        raise NotImplementedError("This method should be overridden by subclasses.")
+class YoloImageDetector(Detector):
+    def __init__(self, resources_path, chunk_size=50,
+                 weights_url='https://faubox.rrze.uni-erlangen.de/dl/fi9iK4rseupfrrTeXWQUGP/weights.zip'):
+        self._prepare_resources(resources_path, weights_url)
+        self.model = YOLO(os.path.join(resources_path, 'yolov8.pt'))
+        self.chunk_size = chunk_size
+        self.device = 'mps' if torch.backends.mps.is_available() else 'cpu'
+        print(f'Detection running on {self.device}')
+    def _prepare_resources(self, resources_path, weights_url):
+        if os.path.exists(os.path.join(resources_path, 'yolov8.pt')):
+            return
+        print(f'Downloading YOLO weights to {os.path.abspath(resources_path)}...')
+        download_and_unzip(weights_url, resources_path)
+    def _batch(self, iterable, n=1):
+        l = len(iterable)
+        for ndx in range(0, l, n):
+            yield iterable[ndx:ndx+n]
+    def _move_crops(self, yolo_name, out_dir):
+        yolo_output = os.path.join(out_dir, yolo_name)
+        crop_dir = os.path.join(out_dir, 'images')
+        if not os.path.isdir(crop_dir):
+            os.makedirs(crop_dir)
+        for file in glob.glob(os.path.join(yolo_output, '**', '*.jpg'), recursive=True):
+            fn = os.path.basename(file)
+            shutil.move(file, os.path.join(crop_dir, fn))
+        shutil.rmtree(yolo_output)
+        print(f'Detected images moved to \033[1m{crop_dir}\033[0m')
+    def parse_directory(self, input_dir, crop_dir='tmp', output_base_dir='output'):
+        image_exts = ['.jpg', '.jpeg']
+        images_to_process = [os.path.join(input_dir, fn) for fn in os.listdir(input_dir) if os.path.splitext(fn)[1] in image_exts]
+        n_chunks = len(images_to_process) // self.chunk_size + 1
+        i = 1
+        for img_chunk in self._batch(images_to_process, self.chunk_size):
+            print(f'Detecting images in chunk {i}/{n_chunks}..')
+            self.model.predict(img_chunk, save_crop=True, device=self.device, name=crop_dir, project=output_base_dir)
+            self._move_crops(crop_dir, output_base_dir)
+            i += 1
+    def detect(self,image):
+        results = self.model.predict(image, device=self.device, max_det=1)
+        return results
+    def crop_and_save(self, detections, out_dir, name):
+        if not os.path.isdir(out_dir):
+            os.makedirs(out_dir)
+        for i, result in enumerate(detections):
+            if result.boxes is None or len(result.boxes) == 0:
+                continue # no image found
+            # Sort boxes by confidence in descending order
+            x1,y1,x2,y2 = sorted(result.boxes, key=lambda box: box.conf, reverse=True)[0].xyxy[0].flatten().int().tolist()
+            crop = result.orig_img[y1:y2,x1:x2]
+            cv2.imwrite(os.path.join(out_dir, name),crop)
+class DummyDetector(Detector):
+    def __init__(self, chunk_size=50):
+        self.chunk_size = chunk_size
+        print(f'Dummy detector doing nothing, OCR only.')
+    def parse_directory(self, input_dir, crop_dir='tmp', output_base_dir='output'):
+        pass
+    def detect(self, image):
+        return []
+    def crop_and_save(self, detections, out_dir, name):
+        pass

inventory_ocr/postprocessor.py ADDED Viewed

@@ -0,0 +1,249 @@
+from Levenshtein import distance
+import re
+import csv
+class PostProcessor:
+    def __init__(self):
+        self.delimiter = ','
+    def _write_to_csv(self, data, output_csv):
+        """
+        Write the processed data to a CSV file.
+        """
+        with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
+            writer = csv.DictWriter(file, fieldnames=data[0].keys(),delimiter=self.delimiter)
+            writer.writeheader()
+            writer.writerows(data)
+            print(f"Processed data written to {output_csv}")
+    def _remove_one_header(self, field_value: str, header: str) -> str:
+        def treat_hyphens(field_value: str) -> str:
+            # Replace all dash variants with a standard hyphen
+            return re.sub(r'[\u2010-\u2015\u2212\uFE58\uFE63\uFF0D\-]', '-', field_value)
+        field_value = treat_hyphens(field_value)
+        match = re.search(fr'{header}\s*:', field_value, re.IGNORECASE)
+        if match:
+            field_value = field_value[match.end():].strip()
+            field_value = field_value.split(header)[-1].strip()
+        return field_value
+    def _remove_title_parts(self, row: dict) -> dict:
+        updated_inventory_data = {}
+        for k,v in row.items():
+            updated_inventory_data[k] = self._remove_one_header(v,k)
+        return updated_inventory_data
+    def _update_one_entry(self, row: dict) -> dict:
+        """
+        Update a single entry in the row based on specific rules.
+        This method can be overridden by subclasses to implement custom logic.
+        """
+        return row
+    def postprocess(self, input_csv, output_csv):
+        """
+        Post-process the data after OCR.
+        """
+        with open(input_csv, mode='r', encoding='utf-8') as file:
+            reader = csv.DictReader(file)
+            input_data = [row for row in reader]
+        updated_data = []
+        for row in input_data:
+            updated_row = self._remove_title_parts(row)
+            updated_row = self._update_one_entry(updated_row)
+            updated_data.append(updated_row)
+        self._write_to_csv(updated_data, output_csv)
+class BenchmarkingPostProcessor(PostProcessor):
+    def __init__(self):
+        super().__init__()
+        self._empty_marker = ''
+        print('Benchmarking postprocessor initialized.')
+    def _handle_masse(self, row: dict) -> str:
+        if 'Gewicht' in row and row['Gewicht'] != '' and 'Maße' in row:
+            return row['Maße'], row['Gewicht'] # treat cases where gewicht is handled by ocr engine (ie Mistral)
+        masse_str = row.get('Maße', '')
+        # Extract all cohesive numbers before 'g' as weight, others as measurement
+        weights = []
+        measurements = []
+        # Find all numbers followed by optional whitespace and 'g' (weight)
+        weight_matches = re.findall(r'(\d+(?:[.,]\d+)?\s*g)', masse_str)
+        weights.extend(weight_matches)
+        # Remove weights from the string to avoid double counting
+        masse_str_no_weights = re.sub(r'\d+(?:[.,]\d+)?\s*g', '', masse_str)
+        # Return measurements and weight as a tuple of concatenated strings
+        weight_str = ', '.join(weights) if weights else self._empty_marker
+        masse_str_no_weights = self._remove_one_header(masse_str_no_weights, 'Gewicht')
+        return masse_str_no_weights, weight_str
+    def _update_one_entry(self, row: dict) -> dict:
+        """
+        Update a single entry in the row based on benchmarking rules.
+        """
+        updated_row = {}
+        for k, v in row.items():
+            if v is None or v.strip() == '':
+                updated_row[k] = self._empty_marker
+            else:
+                updated_row[k] = v.strip()
+        updated_row['filename'] = row.get('source_file', self._empty_marker)
+        del updated_row['source_file']
+        updated_row['Inventarnummer'] = row.get('Inv. Nr.', self._empty_marker)
+        del updated_row['Inv. Nr.']
+        updated_row['erworben am'] = row.get('am', self._empty_marker)
+        del updated_row['am']
+        updated_row['Versicherungswert'] = row.get('Vers.-Wert', self._empty_marker)
+        del updated_row['Vers.-Wert']
+        measurements, weight = self._handle_masse(row)
+        updated_row['Masse'] = measurements
+        updated_row['Gewicht'] = weight
+        del updated_row['Maße']
+        return updated_row
+class SchmuckPostProcessor(PostProcessor):
+    def __init__(self):
+        print("Using SchmuckPostProcessor for production mode.")
+        super().__init__()
+        self._empty_marker = 'Unbekannt'
+        self.delimiter = ';'  # Use semicolon as delimiter for Schmuck CSV
+        # spacy.cli.download("de_core_news_sm")
+        # self.nlp = spacy.load("de_core_news_sm")
+    def _extract_price_and_currency(self, price_str: str) -> tuple:
+        def is_donated(price_str):
+            if distance(price_str.strip(), 'Stiftung') <= 1:
+                return True
+            if distance(price_str.strip(), 'Geschenk') <= 1:
+                return True
+            return False
+        if not price_str or price_str.strip() == '':
+            price = 'Unbekannt'
+        else:
+            price = re.sub(r'[^\d]', '', price_str)  # Remove non-digit characters
+        if is_donated(price_str):
+            return 0, 'Deutsche Mark'
+        if 'DM' in price_str or 'Dm' in price_str:
+            return price, 'Deutsche Mark'
+        if 'M' in price_str:
+            return price, 'Reichsmark (Deutsches Reich)'
+        return price, 'Deutsche Mark'
+    def _is_bought(self, row: dict) -> bool:
+        erworben = row.get('erworben von', '').strip()
+        if erworben.lower() == 'stiftung':
+            return False
+        if not row['Preis'] or row['Preis'].strip() == '':
+            return False
+        return True
+    def _extract_notes(self, row: dict) -> str | None:
+        notes = row.get('Literatur')
+        if not self._is_bought(row) and row.get('erworben von') != '':
+            notes += f"Angaben aus dem Inventarkartenfeld 'erworben von': {row.get('erworben von')}"
+        return notes
+    def _extract_standort(self, standort: str) -> str:
+        if not standort or standort.strip() == '':
+            return self._empty_marker
+        return "alter Standort: " + standort
+    def _extract_erwerb(self, row: dict) -> list:
+        # erworben_doc = self.nlp(row.get('erworben von', ''))
+        # persons = [ent.text for ent in erworben_doc.ents if ent.label_ == 'PER']
+        # places = [ent.text for ent in erworben_doc.ents if ent.label_ == 'LOC']
+        # TODO
+        erworben_str = row.get('erworben von')
+        preis_str = row.get('Preis')
+        matches = re.search("Hersteller|Entwurf|Ausführung|Herst.|Entw.|Ausf.", erworben_str, flags=re.IGNORECASE)
+        if not matches:
+            row = row
+        return row
+    def _extract_description(self, row: dict) -> str:
+        DEFAULT_DESCRIPTION = 'Dieses Schmuckstück ist aus dem historischen Schmuckinventar der Kunstgewerbeschule Pforzheim.'
+        beschreibung = row.get('Beschreibung', DEFAULT_DESCRIPTION)
+        if not beschreibung or beschreibung.strip() == '':
+            beschreibung = DEFAULT_DESCRIPTION
+        return beschreibung
+    def _update_one_entry(self, row: dict) -> dict:
+        """
+        Update a single entry in the row based on rules.
+        """
+        unchanged_keys = []
+        def get_or_default(row: dict, key: str, default=self._empty_marker) -> str:
+            value = row.get(key, '')
+            if value is None or value.strip() == '':
+                return default
+            return value
+        updated_row = {}
+        for k in unchanged_keys:
+            updated_row[k] = get_or_default(row, k)
+        updated_row['object_title'] = get_or_default(row, 'Gegenstand')
+        updated_row['object_type'] = "Schmuck"
+        updated_row['inventory_number'] = get_or_default(row, 'Inv. Nr.')
+        updated_row['remarks_short'] = get_or_default(row, 'source_file')
+        updated_row['remarks_long'] = get_or_default(row, 'Maße')
+        updated_row['literature_title1'] = get_or_default(row, 'Literatur')
+        updated_row['abode_regular'] = self._extract_standort(row.get('Standort'))
+        updated_row["abode_actual"] = "Schmuckmuseum Pforzheim"
+        updated_row['material_separate'] = get_or_default(row, 'Material')
+        updated_row['object_description'] = self._extract_description(row)
+        insurance_value, insurance_value_currency = self._extract_price_and_currency(row.get('Vers.-Wert', ''))
+        updated_row['worth_insurance_value'] = insurance_value
+        updated_row['worth_insurance_unit'] = insurance_value_currency
+        # updated_row['Notizen'] = self._extract_notes(row) or empty_marker
+        updated_row['exhibition_name1'] = get_or_default(row, 'Ausstellungen')
+        updated_row['image_name1'] = get_or_default(row,'Foto Notes')
+        updated_row['image_owner1'] = 'Schmuckmuseum Pforzheim'
+        updated_row['image_rights1'] = 'RR-R'
+        updated_row['image_visible1'] = 'y'
+        updated_row['image_main1'] = 'y'
+        updated_row['form_designed_when1'] = get_or_default(row, 'Datierung')
+        updated_row['form_designed_who1'] = self._empty_marker
+        updated_row['form_designed_where1'] = get_or_default(row, 'Herkunft')
+        # internal fields
+        updated_row['acquisition_type'] = self._empty_marker
+        updated_row['acquisition_name'] = 'Erwerb'
+        updated_row['acquisition_source_name'] = get_or_default(row, 'erworben von')
+        updated_row['acquisition_date'] = get_or_default(row, 'am', default='3000-01-01')
+        acquisition_price, acquisition_price_currency = self._extract_price_and_currency(row.get('Preis', ''))
+        updated_row['acquisition_price'] = acquisition_price
+        updated_row['acquisition_price_currency'] = acquisition_price_currency
+        updated_row['acquisition_note'] = "Art des Zugangs ist zu überprüfen."
+        # copied from acquisition for potential publication
+        updated_row['received_ownership_when1'] = updated_row['acquisition_date']
+        updated_row['received_ownership_who1'] = updated_row['acquisition_source_name']
+        updated_row['received_ownership_where1'] = 'Pforzheim'
+        updated_row['received_ownership_where_sure1'] = 'n'
+        return updated_row