inventory-ocr 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,53 @@
1
+ import gradio as gr
2
+ from gradio_image_annotation import image_annotator
3
+ import os
4
+ import yaml
5
+
6
def create_annotation_app(input_dir, layout_file_path):
    """
    Build a gr.Blocks app for drawing the named layout regions on a template image.

    The first image (by sorted file name) found in ``input_dir`` is used as the
    template. When the user clicks "Extract Layout", every drawn box is stored
    under its label as ``[xmin, ymin, xmax, ymax]`` relative to the image width
    and height, the mapping is written to ``layout_file_path`` as YAML under the
    ``regions`` key, and the whole process is terminated.

    TODO: Display a nice success message and close the browser window before
    killing the app serving process (gradio conditional rendering makes this
    awkward).

    Args:
        input_dir: Directory searched (non-recursively) for .png/.jpg/.jpeg files.
        layout_file_path: Destination path of the YAML layout file.

    Returns:
        The gr.Blocks application (not launched here).

    Raises:
        AssertionError: If ``input_dir`` contains no image file.
    """
    print("Loading template image for region annotation...")
    # Sorted so the chosen template does not depend on the arbitrary os.listdir order.
    img_files = sorted(
        f for f in os.listdir(input_dir)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )
    if not img_files:
        # Raised explicitly instead of via `assert` so the check survives `python -O`;
        # the exception type is unchanged for existing callers.
        raise AssertionError("No image files found in the input directory.")
    template_image_path = os.path.join(input_dir, img_files[0])

    def extract_layout(annotations, layout_state):
        """Write the drawn boxes to the layout file and terminate the process."""
        # Only height and width are needed; slicing also works for 2-D (grayscale) arrays.
        h, w = annotations['image'].shape[:2]

        regions = {}
        for box in annotations['boxes']:
            # Boxes sharing a label overwrite each other: one region per label.
            regions[box['label']] = [
                int(box['xmin']) / w,
                int(box['ymin']) / h,
                int(box['xmax']) / w,
                int(box['ymax']) / h,
            ]

        # flush=True because os._exit below does not flush stdio buffers.
        print(f"Saving layout configuration to {layout_file_path}...", flush=True)
        with open(layout_file_path, 'w') as f:
            yaml.dump({'regions': regions}, f, default_flow_style=False)

        # Hard exit: stops the gradio server once the layout is stored (see TODO above).
        os._exit(0)
        # Not reached while the hard exit above is in place; kept so the handler
        # still matches its declared outputs if the exit is ever removed.
        return gr.Markdown("Layout successfully defined. You may now close this window."), regions

    with gr.Blocks() as annotation_app:
        gr.Markdown("# Layout Annotation Tool")

        layout_state = gr.State([])

        gr.Markdown("### Annotate the Image")

        @gr.render(inputs=layout_state)
        def show_annotation_interface(layout):
            annotations = image_annotator({"image": template_image_path, "boxes": []},
                                          label_list=["Field Name"],
                                          label_colors=[(0, 255, 0)],
                                          boxes_alpha=0.4)
            extract_layout_button = gr.Button("Extract Layout")
            close_message = gr.Markdown("", visible=False)
            extract_layout_button.click(extract_layout,
                                        inputs=[annotations, layout_state],
                                        outputs=[close_message, layout_state])

    return annotation_app
@@ -0,0 +1,16 @@
1
+ regions:
2
+ Gegenstand: [0, 0, 1, 0.1]
3
+ Inv. Nr.: [0.044, 0.08, 0.275, 0.145]
4
+ Herkunft: [0.2737, 0.0806, 0.5054, 0.237]
5
+ Foto Notes: [0.5054, 0.0806, 1, 0.237]
6
+ Standort: [0.047, 0.135, 0.275, 0.235]
7
+ Material: [0.047, 0.237, 0.275, 0.435]
8
+ Datierung: [0.275, 0.237, 0.505, 0.317]
9
+ Maße: [0.275, 0.317, 0.505, 0.435]
10
+ erworben von: [0.047, 0.435, 0.505, 0.521]
11
+ Beschreibung: [0.047, 0.521, 0.505, 0.794]
12
+ Ausstellungen: [0.047, 0.795, 0.505, 1]
13
+ am: [0.505, 0.435, 0.707, 0.53]
14
+ Preis: [0.707, 0.435, 0.869, 0.53]
15
+ Vers.-Wert: [0.870, 0.435, 1.0, 0.53]
16
+ Literatur: [0.50, 0.53, 1.0, 1.0]
@@ -0,0 +1,95 @@
1
+ from ultralytics import YOLO
2
+ from inventory_ocr.utils import download_and_unzip
3
+ import glob
4
+ import shutil
5
+ import torch
6
+ import cv2
7
+ import os
8
+
9
+
10
class Detector:
    """Common interface of all detectors; every method must be overridden."""

    def parse_directory(self, input_dir, crop_dir='tmp', output_base_dir='output'):
        """Process all images of a directory and store the resulting crops."""
        message = "This method should be overridden by subclasses."
        raise NotImplementedError(message)

    def detect(self, image):
        """Run the detection on one image."""
        message = "This method should be overridden by subclasses."
        raise NotImplementedError(message)

    def crop_and_save(self, detections, out_dir, name):
        """Cut the detected objects out and write them below ``out_dir``."""
        message = "This method should be overridden by subclasses."
        raise NotImplementedError(message)
23
+
24
+
25
class YoloImageDetector(Detector):
    """Detector that cuts detected regions out of images with a YOLO model."""

    def __init__(self, resources_path, chunk_size=50,
                 weights_url='https://faubox.rrze.uni-erlangen.de/dl/fi9iK4rseupfrrTeXWQUGP/weights.zip'):
        """
        Load the YOLO weights, fetching them first if they are missing.

        Args:
            resources_path: Directory expected to contain ``yolov8.pt``.
            chunk_size: Number of images handed to the model per predict call
                in ``parse_directory``.
            weights_url: Archive passed to ``download_and_unzip`` when the
                weights file is not present yet.
        """
        self._prepare_resources(resources_path, weights_url)
        self.model = YOLO(os.path.join(resources_path, 'yolov8.pt'))
        self.chunk_size = chunk_size
        self.device = 'mps' if torch.backends.mps.is_available() else 'cpu'
        print(f'Detection running on {self.device}')

    def _prepare_resources(self, resources_path, weights_url):
        """Call ``download_and_unzip`` unless ``yolov8.pt`` already exists."""
        if os.path.exists(os.path.join(resources_path, 'yolov8.pt')):
            return
        print(f'Downloading YOLO weights to {os.path.abspath(resources_path)}...')
        download_and_unzip(weights_url, resources_path)

    def _batch(self, iterable, n=1):
        """Yield consecutive slices of ``iterable`` holding at most ``n`` items."""
        for start in range(0, len(iterable), n):
            yield iterable[start:start + n]

    def _move_crops(self, yolo_name, out_dir):
        """
        Collect all ``.jpg`` files below ``out_dir/yolo_name`` in ``out_dir/images``.

        Files are moved by base name, so equally named files from different
        sub-directories end up at the same target path. The YOLO run directory
        is removed afterwards.
        """
        yolo_output = os.path.join(out_dir, yolo_name)
        crop_dir = os.path.join(out_dir, 'images')
        os.makedirs(crop_dir, exist_ok=True)
        for file in glob.glob(os.path.join(yolo_output, '**', '*.jpg'), recursive=True):
            shutil.move(file, os.path.join(crop_dir, os.path.basename(file)))
        # The run directory may be absent (presumably when a chunk produced no
        # crops); rmtree would raise on a missing path.
        if os.path.isdir(yolo_output):
            shutil.rmtree(yolo_output)
        print(f'Detected images moved to \033[1m{crop_dir}\033[0m')

    def parse_directory(self, input_dir, crop_dir='tmp', output_base_dir='output'):
        """
        Run the model chunk-wise over all JPEG files of ``input_dir``.

        After each chunk the crops saved by the model are moved to
        ``output_base_dir/images`` (see ``_move_crops``).

        Args:
            input_dir: Directory listed (non-recursively) for .jpg/.jpeg files;
                the extension check is case-insensitive.
            crop_dir: Passed to the model as run ``name``.
            output_base_dir: Passed to the model as ``project``.
        """
        image_exts = ('.jpg', '.jpeg')
        images_to_process = sorted(
            os.path.join(input_dir, fn)
            for fn in os.listdir(input_dir)
            if os.path.splitext(fn)[1].lower() in image_exts
        )
        # Ceiling division: an exact multiple of chunk_size must not count an extra chunk.
        n_chunks = (len(images_to_process) + self.chunk_size - 1) // self.chunk_size
        for i, img_chunk in enumerate(self._batch(images_to_process, self.chunk_size), start=1):
            print(f'Detecting images in chunk {i}/{n_chunks}..')
            self.model.predict(img_chunk, save_crop=True, device=self.device,
                               name=crop_dir, project=output_base_dir)
            self._move_crops(crop_dir, output_base_dir)

    def detect(self, image):
        """Return the model's prediction results for ``image`` (at most one detection)."""
        return self.model.predict(image, device=self.device, max_det=1)

    def crop_and_save(self, detections, out_dir, name):
        """
        Write the highest-confidence box of each result to ``out_dir/name``.

        Results without boxes are skipped. All results are written to the same
        file name, so with several results the last one written wins.
        """
        os.makedirs(out_dir, exist_ok=True)
        for result in detections:
            if result.boxes is None or len(result.boxes) == 0:
                continue  # no image found
            best_box = max(result.boxes, key=lambda box: box.conf)
            x1, y1, x2, y2 = best_box.xyxy[0].flatten().int().tolist()
            crop = result.orig_img[y1:y2, x1:x2]
            cv2.imwrite(os.path.join(out_dir, name), crop)
81
+
82
+
83
class DummyDetector(Detector):
    """No-op detector used when only the OCR step is wanted."""

    def __init__(self, chunk_size=50):
        """Remember ``chunk_size`` and announce that no detection is done."""
        self.chunk_size = chunk_size
        print('Dummy detector doing nothing, OCR only.')

    def parse_directory(self, input_dir, crop_dir='tmp', output_base_dir='output'):
        """Do nothing."""
        return None

    def detect(self, image):
        """Report no detections."""
        return list()

    def crop_and_save(self, detections, out_dir, name):
        """Do nothing."""
        return None
@@ -0,0 +1,249 @@
1
+ from Levenshtein import distance
2
+ import re
3
+ import csv
4
+
5
class PostProcessor:
    """
    Base post-processor for OCR results stored as CSV.

    Each value has the title of its own column removed (OCR regions usually
    contain the printed field title), is optionally adjusted by
    ``_update_one_entry``, and is written to a new CSV file using
    ``self.delimiter``.
    """

    def __init__(self):
        self.delimiter = ','

    def _write_to_csv(self, data, output_csv):
        """
        Write the processed rows to ``output_csv``.

        The column order is taken from the first row. With no rows an empty
        file is created instead of failing on ``data[0]``.
        """
        with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
            if data:
                writer = csv.DictWriter(file, fieldnames=list(data[0].keys()),
                                        delimiter=self.delimiter)
                writer.writeheader()
                writer.writerows(data)
        if data:
            print(f"Processed data written to {output_csv}")
        else:
            print(f"No rows to write; created empty file {output_csv}")

    def _remove_one_header(self, field_value: str, header: str) -> str:
        """
        Strip the field title ``header`` from the start of ``field_value``.

        All dash variants are normalised to a plain hyphen first. Then
        everything up to and including ``<header>:`` (case-insensitive) is
        dropped, and finally only the text after the last literal occurrence
        of ``header`` is kept. Non-string values (e.g. ``None`` for missing CSV
        cells) are returned unchanged.
        """
        if not isinstance(field_value, str):
            return field_value
        # Replace all dash variants with a standard hyphen
        field_value = re.sub(r'[\u2010-\u2015\u2212\uFE58\uFE63\uFF0D\-]', '-', field_value)
        if not isinstance(header, str) or not header:
            # Nothing to remove; str.split('') would raise ValueError.
            return field_value.strip()
        # Headers such as "Inv. Nr." contain regex metacharacters and must match literally.
        match = re.search(fr'{re.escape(header)}\s*:', field_value, re.IGNORECASE)
        if match:
            field_value = field_value[match.end():].strip()
        return field_value.split(header)[-1].strip()

    def _remove_title_parts(self, row: dict) -> dict:
        """Return a copy of ``row`` with each value cleaned of its own column title."""
        return {key: self._remove_one_header(value, key) for key, value in row.items()}

    def _update_one_entry(self, row: dict) -> dict:
        """
        Update a single entry in the row based on specific rules.
        This method can be overridden by subclasses to implement custom logic.
        """
        return row

    def postprocess(self, input_csv, output_csv):
        """
        Post-process the data after OCR.

        Reads ``input_csv`` (comma separated, UTF-8), cleans and updates every
        row, and writes the result to ``output_csv``.
        """
        with open(input_csv, mode='r', encoding='utf-8') as file:
            input_data = list(csv.DictReader(file))

        updated_data = [
            self._update_one_entry(self._remove_title_parts(row))
            for row in input_data
        ]
        self._write_to_csv(updated_data, output_csv)
57
+
58
class BenchmarkingPostProcessor(PostProcessor):
    """Post-processor producing the column names used for benchmarking."""

    def __init__(self):
        super().__init__()
        self._empty_marker = ''
        print('Benchmarking postprocessor initialized.')

    def _handle_masse(self, row: dict) -> tuple:
        """
        Split the 'Maße' field into measurements and weight.

        Returns:
            ``(measurements, weight)``. If the row already has a non-empty
            'Gewicht' value (the weight was separated by the OCR engine, e.g.
            Mistral), 'Maße' and 'Gewicht' are returned as they are. Otherwise
            every number followed by ``g`` is taken out of 'Maße' as a weight;
            several weights are joined with ``', '``.
        """
        if row.get('Gewicht') and 'Maße' in row:
            return row['Maße'], row['Gewicht']

        masse_str = row.get('Maße') or ''
        # A number (optionally with decimal part) followed by optional whitespace and 'g'
        weight_pattern = r'\d+(?:[.,]\d+)?\s*g'
        weights = re.findall(weight_pattern, masse_str)
        # Remove weights from the string to avoid double counting
        masse_str_no_weights = re.sub(weight_pattern, '', masse_str)

        weight_str = ', '.join(weights) if weights else self._empty_marker
        measurements = self._remove_one_header(masse_str_no_weights, 'Gewicht')
        return measurements, weight_str

    def _update_one_entry(self, row: dict) -> dict:
        """
        Update a single entry in the row based on benchmarking rules.

        All values are stripped (missing or blank values become the empty
        marker), several columns are renamed, and 'Maße' is replaced by the
        'Masse' and 'Gewicht' columns. Renamed and new columns are appended
        after the untouched ones.
        """
        updated_row = {}
        for k, v in row.items():
            if v is None or v.strip() == '':
                updated_row[k] = self._empty_marker
            else:
                updated_row[k] = v.strip()

        # Computed from the normalised values, before 'Maße' is dropped below.
        measurements, weight = self._handle_masse(updated_row)

        renamed_columns = (
            ('source_file', 'filename'),
            ('Inv. Nr.', 'Inventarnummer'),
            ('am', 'erworben am'),
            ('Vers.-Wert', 'Versicherungswert'),
        )
        for old_name, new_name in renamed_columns:
            # pop() tolerates a missing source column and carries over the
            # normalised value rather than the raw one.
            updated_row[new_name] = updated_row.pop(old_name, self._empty_marker)

        updated_row['Masse'] = measurements
        updated_row['Gewicht'] = weight
        updated_row.pop('Maße', None)
        return updated_row
110
+
111
class SchmuckPostProcessor(PostProcessor):
    """Post-processor mapping the inventory card fields to the Schmuck export columns."""

    def __init__(self):
        print("Using SchmuckPostProcessor for production mode.")
        super().__init__()
        self._empty_marker = 'Unbekannt'
        self.delimiter = ';'  # Use semicolon as delimiter for Schmuck CSV

    def _extract_price_and_currency(self, price_str: str) -> tuple:
        """
        Derive ``(price, currency)`` from a free-text price field.

        * missing/blank text -> ``(empty marker, 'Deutsche Mark')``
        * text within edit distance 1 of 'Stiftung' or 'Geschenk'
          -> ``(0, 'Deutsche Mark')``
        * otherwise the price is the digits of the text (empty marker if there
          are none); the currency is 'Deutsche Mark' when the text contains
          'DM'/'Dm', 'Reichsmark (Deutsches Reich)' when it contains another
          'M', and 'Deutsche Mark' by default.
        """
        price_str = (price_str or '').strip()
        if not price_str:
            return self._empty_marker, 'Deutsche Mark'

        if distance(price_str, 'Stiftung') <= 1 or distance(price_str, 'Geschenk') <= 1:
            return 0, 'Deutsche Mark'

        # NOTE(review): every non-digit is dropped, so separators are lost as
        # well ("12,50" becomes "1250") - confirm this is intended.
        price = re.sub(r'\D', '', price_str) or self._empty_marker

        if 'DM' in price_str or 'Dm' in price_str:
            return price, 'Deutsche Mark'
        if 'M' in price_str:
            return price, 'Reichsmark (Deutsches Reich)'
        return price, 'Deutsche Mark'

    def _is_bought(self, row: dict) -> bool:
        """Return True unless 'erworben von' is 'Stiftung' or 'Preis' is missing/blank."""
        erworben = (row.get('erworben von') or '').strip()
        if erworben.lower() == 'stiftung':
            return False
        preis = row.get('Preis')
        return bool(preis and preis.strip())

    def _extract_notes(self, row: dict) -> str | None:
        """
        Return the 'Literatur' text, extended by the 'erworben von' content
        when the object was not bought and that field is not empty.
        """
        notes = row.get('Literatur')
        erworben = (row.get('erworben von') or '').strip()
        if erworben and not self._is_bought(row):
            addition = f"Angaben aus dem Inventarkartenfeld 'erworben von': {erworben}"
            notes = f"{notes} {addition}" if notes else addition
        return notes

    def _extract_standort(self, standort: str) -> str:
        """Prefix a non-blank location with 'alter Standort: '; otherwise return the empty marker."""
        if not standort or standort.strip() == '':
            return self._empty_marker
        return "alter Standort: " + standort

    def _extract_erwerb(self, row: dict) -> dict:
        """
        Placeholder for splitting the 'erworben von' field; returns ``row`` unchanged.

        TODO: detect maker hints (Hersteller / Entwurf / Ausführung and their
        abbreviations Herst. / Entw. / Ausf.) as well as persons and places in
        'erworben von', and combine them with 'Preis'.
        """
        return row

    def _extract_description(self, row: dict) -> str:
        """Return the 'Beschreibung' text, or a default sentence when it is missing/blank."""
        DEFAULT_DESCRIPTION = 'Dieses Schmuckstück ist aus dem historischen Schmuckinventar der Kunstgewerbeschule Pforzheim.'
        beschreibung = row.get('Beschreibung', DEFAULT_DESCRIPTION)
        if not beschreibung or beschreibung.strip() == '':
            beschreibung = DEFAULT_DESCRIPTION
        return beschreibung

    def _update_one_entry(self, row: dict) -> dict:
        """
        Update a single entry in the row based on rules.

        Builds the export row from scratch; the insertion order below defines
        the column order of the resulting CSV.
        """
        def get_or_default(row: dict, key: str, default=self._empty_marker) -> str:
            value = row.get(key, '')
            if value is None or value.strip() == '':
                return default
            return value

        updated_row = {}

        updated_row['object_title'] = get_or_default(row, 'Gegenstand')
        updated_row['object_type'] = "Schmuck"
        updated_row['inventory_number'] = get_or_default(row, 'Inv. Nr.')

        updated_row['remarks_short'] = get_or_default(row, 'source_file')
        updated_row['remarks_long'] = get_or_default(row, 'Maße')
        updated_row['literature_title1'] = get_or_default(row, 'Literatur')

        updated_row['abode_regular'] = self._extract_standort(row.get('Standort'))
        updated_row["abode_actual"] = "Schmuckmuseum Pforzheim"

        updated_row['material_separate'] = get_or_default(row, 'Material')
        updated_row['object_description'] = self._extract_description(row)

        insurance_value, insurance_value_currency = self._extract_price_and_currency(row.get('Vers.-Wert', ''))
        updated_row['worth_insurance_value'] = insurance_value
        updated_row['worth_insurance_unit'] = insurance_value_currency

        # The notes built by _extract_notes are currently not exported.
        updated_row['exhibition_name1'] = get_or_default(row, 'Ausstellungen')

        updated_row['image_name1'] = get_or_default(row, 'Foto Notes')
        updated_row['image_owner1'] = 'Schmuckmuseum Pforzheim'
        updated_row['image_rights1'] = 'RR-R'
        updated_row['image_visible1'] = 'y'
        updated_row['image_main1'] = 'y'

        updated_row['form_designed_when1'] = get_or_default(row, 'Datierung')
        updated_row['form_designed_who1'] = self._empty_marker
        updated_row['form_designed_where1'] = get_or_default(row, 'Herkunft')

        # internal fields
        updated_row['acquisition_type'] = self._empty_marker
        updated_row['acquisition_name'] = 'Erwerb'
        updated_row['acquisition_source_name'] = get_or_default(row, 'erworben von')
        updated_row['acquisition_date'] = get_or_default(row, 'am', default='3000-01-01')
        acquisition_price, acquisition_price_currency = self._extract_price_and_currency(row.get('Preis', ''))
        updated_row['acquisition_price'] = acquisition_price
        updated_row['acquisition_price_currency'] = acquisition_price_currency
        updated_row['acquisition_note'] = "Art des Zugangs ist zu überprüfen."

        # copied from acquisition for potential publication
        updated_row['received_ownership_when1'] = updated_row['acquisition_date']
        updated_row['received_ownership_who1'] = updated_row['acquisition_source_name']
        updated_row['received_ownership_where1'] = 'Pforzheim'
        updated_row['received_ownership_where_sure1'] = 'n'

        return updated_row