winebox 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,90 @@
1
+ """Image storage service for managing wine label images."""
2
+
3
+ import uuid
4
+ from pathlib import Path
5
+
6
+ import aiofiles
7
+ from fastapi import UploadFile
8
+
9
+ from winebox.config import settings
10
+
11
+
12
+ class ImageStorageService:
13
+ """Service for storing and managing wine label images."""
14
+
15
+ def __init__(self, storage_path: Path | None = None) -> None:
16
+ """Initialize the image storage service.
17
+
18
+ Args:
19
+ storage_path: Path to store images. Defaults to config setting.
20
+ """
21
+ self.storage_path = storage_path or settings.image_storage_path
22
+ self.storage_path.mkdir(parents=True, exist_ok=True)
23
+
24
+ async def save_image(self, upload_file: UploadFile) -> str:
25
+ """Save an uploaded image file.
26
+
27
+ Args:
28
+ upload_file: The uploaded file from FastAPI.
29
+
30
+ Returns:
31
+ The filename of the saved image.
32
+ """
33
+ # Generate unique filename
34
+ ext = Path(upload_file.filename or "image.jpg").suffix.lower()
35
+ if ext not in [".jpg", ".jpeg", ".png", ".gif", ".webp"]:
36
+ ext = ".jpg"
37
+
38
+ filename = f"{uuid.uuid4()}{ext}"
39
+ file_path = self.storage_path / filename
40
+
41
+ # Save file
42
+ content = await upload_file.read()
43
+ async with aiofiles.open(file_path, "wb") as f:
44
+ await f.write(content)
45
+
46
+ return filename
47
+
48
+ async def delete_image(self, filename: str) -> bool:
49
+ """Delete an image file.
50
+
51
+ Args:
52
+ filename: The filename to delete.
53
+
54
+ Returns:
55
+ True if deleted, False if not found.
56
+ """
57
+ file_path = self.storage_path / filename
58
+
59
+ if file_path.exists():
60
+ file_path.unlink()
61
+ return True
62
+
63
+ return False
64
+
65
+ def get_image_path(self, filename: str) -> Path | None:
66
+ """Get the full path to an image file.
67
+
68
+ Args:
69
+ filename: The filename to look up.
70
+
71
+ Returns:
72
+ The full path if file exists, None otherwise.
73
+ """
74
+ file_path = self.storage_path / filename
75
+
76
+ if file_path.exists():
77
+ return file_path
78
+
79
+ return None
80
+
81
+ def get_image_url(self, filename: str) -> str:
82
+ """Get the URL path for an image.
83
+
84
+ Args:
85
+ filename: The image filename.
86
+
87
+ Returns:
88
+ The URL path to access the image.
89
+ """
90
+ return f"/api/images/{filename}"
@@ -0,0 +1,128 @@
1
+ """OCR service for extracting text from wine label images."""
2
+
3
+ import logging
4
+ from pathlib import Path
5
+
6
+ from PIL import Image
7
+
8
+ from winebox.config import settings
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ class OCRService:
14
+ """Service for extracting text from images using Tesseract OCR."""
15
+
16
+ def __init__(self) -> None:
17
+ """Initialize the OCR service."""
18
+ # Configure Tesseract command if specified
19
+ if settings.tesseract_cmd:
20
+ import pytesseract
21
+ pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
22
+
23
+ async def extract_text(self, image_path: str | Path) -> str:
24
+ """Extract text from an image file.
25
+
26
+ Args:
27
+ image_path: Path to the image file (relative to image storage).
28
+
29
+ Returns:
30
+ Extracted text from the image.
31
+ """
32
+ try:
33
+ import pytesseract
34
+
35
+ # Build full path if relative
36
+ if isinstance(image_path, str) and not Path(image_path).is_absolute():
37
+ full_path = settings.image_storage_path / image_path
38
+ else:
39
+ full_path = Path(image_path)
40
+
41
+ if not full_path.exists():
42
+ logger.warning(f"Image file not found: {full_path}")
43
+ return ""
44
+
45
+ # Open image and extract text
46
+ image = Image.open(full_path)
47
+
48
+ # Preprocess image for better OCR results
49
+ # Convert to grayscale
50
+ if image.mode != "L":
51
+ image = image.convert("L")
52
+
53
+ # Extract text
54
+ text = pytesseract.image_to_string(
55
+ image,
56
+ lang="eng",
57
+ config="--psm 6", # Assume uniform block of text
58
+ )
59
+
60
+ return text.strip()
61
+
62
+ except ImportError:
63
+ logger.error("pytesseract is not installed")
64
+ return ""
65
+ except Exception as e:
66
+ logger.error(f"OCR extraction failed: {e}")
67
+ return ""
68
+
69
+ async def extract_text_with_confidence(
70
+ self, image_path: str | Path
71
+ ) -> tuple[str, float]:
72
+ """Extract text from an image with confidence score.
73
+
74
+ Args:
75
+ image_path: Path to the image file.
76
+
77
+ Returns:
78
+ Tuple of (extracted_text, average_confidence).
79
+ """
80
+ try:
81
+ import pytesseract
82
+
83
+ # Build full path if relative
84
+ if isinstance(image_path, str) and not Path(image_path).is_absolute():
85
+ full_path = settings.image_storage_path / image_path
86
+ else:
87
+ full_path = Path(image_path)
88
+
89
+ if not full_path.exists():
90
+ logger.warning(f"Image file not found: {full_path}")
91
+ return "", 0.0
92
+
93
+ image = Image.open(full_path)
94
+
95
+ if image.mode != "L":
96
+ image = image.convert("L")
97
+
98
+ # Get detailed data with confidence
99
+ data = pytesseract.image_to_data(
100
+ image,
101
+ lang="eng",
102
+ config="--psm 6",
103
+ output_type=pytesseract.Output.DICT,
104
+ )
105
+
106
+ # Calculate average confidence for words
107
+ confidences = [
108
+ conf
109
+ for conf, text in zip(data["conf"], data["text"])
110
+ if conf > 0 and text.strip()
111
+ ]
112
+ avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
113
+
114
+ # Get full text
115
+ text = pytesseract.image_to_string(
116
+ image,
117
+ lang="eng",
118
+ config="--psm 6",
119
+ )
120
+
121
+ return text.strip(), avg_confidence
122
+
123
+ except ImportError:
124
+ logger.error("pytesseract is not installed")
125
+ return "", 0.0
126
+ except Exception as e:
127
+ logger.error(f"OCR extraction failed: {e}")
128
+ return "", 0.0
@@ -0,0 +1,411 @@
1
+ """Wine parser service for extracting structured data from OCR text."""
2
+
3
+ import logging
4
+ import re
5
+ from typing import Any
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
# Common grape varieties
#
# Scanned in list order by WineParserService._extract_grape_variety, which
# compares case-insensitively and returns the first entry found in the label
# text. Order is therefore priority: a longer name must come before any
# shorter name it contains ("Cabernet Sauvignon" before "Cabernet").
# The value returned to callers is the spelling used here.
# "Prosecco" and "Champagne" are also listed in WINE_REGIONS, so a label
# mentioning either one yields both a grape variety and a region.
GRAPE_VARIETIES = [
    "Cabernet Sauvignon",
    "Cabernet",
    "Merlot",
    "Pinot Noir",
    "Pinot Grigio",
    "Pinot Gris",
    "Chardonnay",
    "Sauvignon Blanc",
    "Riesling",
    "Syrah",
    "Shiraz",
    "Zinfandel",
    "Malbec",
    "Tempranillo",
    "Sangiovese",
    "Nebbiolo",
    "Grenache",
    "Garnacha",
    "Mourvedre",
    "Viognier",
    "Gewurztraminer",
    "Chenin Blanc",
    "Semillon",
    "Muscat",
    "Moscato",
    "Prosecco",
    "Champagne",
    "Cava",
    "Albarino",
    "Gruner Veltliner",
    "Torrontes",
    "Verdejo",
    "Carmenere",
    "Petite Sirah",
    "Petit Verdot",
    "Barbera",
    "Primitivo",
    "Montepulciano",
    "Nero d'Avola",
    "Vermentino",
    "Fiano",
    "Trebbiano",
    "Corvina",
    "Gamay",
    "Beaujolais",
]
57
+
58
# Common wine regions
#
# Scanned in list order by WineParserService._extract_region
# (case-insensitive, first entry found in the text wins), so more specific
# names should precede the shorter names they contain ("Napa Valley" before
# "Napa", "Brunello di Montalcino" before "Montalcino").
# NOTE(review): "Rhone" is listed before "Cotes du Rhone", so with
# first-match scanning a "Cotes du Rhone" label is reported as "Rhone" —
# confirm whether that is intended.
# "Oregon" and "Washington" also appear in WINE_COUNTRIES.
# All names are ASCII (no accents).
WINE_REGIONS = [
    # France
    "Bordeaux",
    "Burgundy",
    "Bourgogne",
    "Champagne",
    "Rhone",
    "Loire",
    "Alsace",
    "Provence",
    "Languedoc",
    "Cotes du Rhone",
    "Medoc",
    "Saint-Emilion",
    "Pauillac",
    "Margaux",
    "Chablis",
    "Beaune",
    "Sancerre",
    "Pouilly-Fume",
    # Italy
    "Tuscany",
    "Toscana",
    "Piedmont",
    "Piemonte",
    "Veneto",
    "Sicily",
    "Sicilia",
    "Chianti",
    "Barolo",
    "Barbaresco",
    "Brunello di Montalcino",
    "Montalcino",
    "Valpolicella",
    "Amarone",
    "Prosecco",
    "Friuli",
    # Spain
    "Rioja",
    "Ribera del Duero",
    "Priorat",
    "Rias Baixas",
    "Rueda",
    "Navarra",
    "La Mancha",
    "Jerez",
    "Sherry",
    # USA
    "Napa Valley",
    "Napa",
    "Sonoma",
    "Willamette Valley",
    "Paso Robles",
    "Santa Barbara",
    "Russian River",
    "Alexander Valley",
    "Central Coast",
    "Oregon",
    "Washington",
    # Other
    "Marlborough",
    "Mendoza",
    "Stellenbosch",
    "Mosel",
    "Rheingau",
    "Pfalz",
    "Douro",
    "Alentejo",
    "Barossa Valley",
    "Hunter Valley",
    "Margaret River",
    "Hawke's Bay",
    "Maipo Valley",
    "Colchagua",
    "Casablanca",
]
135
+
136
# Countries commonly associated with wine
#
# Scanned in list order by WineParserService._extract_country
# (case-insensitive); the first entry found in the label text wins, so this
# is list priority rather than position in the text.
# Besides real countries the list carries "USA" and the US states
# "California", "Oregon" and "Washington"; _extract_country reports all four
# as "United States".
WINE_COUNTRIES = [
    "France",
    "Italy",
    "Spain",
    "Portugal",
    "Germany",
    "Austria",
    "United States",
    "USA",
    "California",
    "Oregon",
    "Washington",
    "Australia",
    "New Zealand",
    "Argentina",
    "Chile",
    "South Africa",
    "Greece",
    "Lebanon",
    "Israel",
    "Canada",
    "Hungary",
    "Romania",
    "Georgia",
    "Slovenia",
    "Croatia",
]
164
+
165
+
166
+ class WineParserService:
167
+ """Service for parsing wine information from OCR text."""
168
+
169
+ def parse(self, text: str) -> dict[str, Any]:
170
+ """Parse OCR text to extract wine information.
171
+
172
+ Args:
173
+ text: Raw OCR text from wine labels.
174
+
175
+ Returns:
176
+ Dictionary with extracted wine information.
177
+ """
178
+ result: dict[str, Any] = {}
179
+
180
+ if not text:
181
+ return result
182
+
183
+ # Clean text
184
+ text_clean = text.strip()
185
+ text_upper = text_clean.upper()
186
+ text_lower = text_clean.lower()
187
+
188
+ # Extract vintage year
189
+ vintage = self._extract_vintage(text_clean)
190
+ if vintage:
191
+ result["vintage"] = vintage
192
+
193
+ # Extract alcohol percentage
194
+ alcohol = self._extract_alcohol(text_clean)
195
+ if alcohol:
196
+ result["alcohol_percentage"] = alcohol
197
+
198
+ # Extract grape variety
199
+ grape = self._extract_grape_variety(text_clean)
200
+ if grape:
201
+ result["grape_variety"] = grape
202
+
203
+ # Extract region
204
+ region = self._extract_region(text_clean)
205
+ if region:
206
+ result["region"] = region
207
+
208
+ # Extract country
209
+ country = self._extract_country(text_clean)
210
+ if country:
211
+ result["country"] = country
212
+
213
+ # Try to extract winery name (usually at the top of front label)
214
+ winery = self._extract_winery(text_clean)
215
+ if winery:
216
+ result["winery"] = winery
217
+
218
+ # Try to extract wine name
219
+ name = self._extract_name(text_clean, result)
220
+ if name:
221
+ result["name"] = name
222
+
223
+ return result
224
+
225
+ def _extract_vintage(self, text: str) -> int | None:
226
+ """Extract vintage year from text."""
227
+ # Look for 4-digit years between 1900 and current year + 2
228
+ pattern = r"\b(19\d{2}|20[0-2]\d)\b"
229
+ matches = re.findall(pattern, text)
230
+
231
+ if matches:
232
+ # Prefer years that look like vintages (not recent years like current year)
233
+ for year_str in matches:
234
+ year = int(year_str)
235
+ if 1950 <= year <= 2025:
236
+ return year
237
+
238
+ # Fall back to first found year
239
+ return int(matches[0])
240
+
241
+ return None
242
+
243
+ def _extract_alcohol(self, text: str) -> float | None:
244
+ """Extract alcohol percentage from text."""
245
+ # Various patterns for alcohol content
246
+ patterns = [
247
+ r"(\d{1,2}[.,]\d{1,2})\s*%\s*(?:vol|alc|alcohol|abv)?",
248
+ r"(?:alc|alcohol|abv)[:\s]*(\d{1,2}[.,]\d{1,2})\s*%",
249
+ r"(\d{1,2}[.,]\d{1,2})\s*%\s*vol",
250
+ r"(\d{1,2})\s*%\s*(?:vol|alc|alcohol|abv)",
251
+ ]
252
+
253
+ for pattern in patterns:
254
+ match = re.search(pattern, text, re.IGNORECASE)
255
+ if match:
256
+ value = match.group(1).replace(",", ".")
257
+ try:
258
+ alcohol = float(value)
259
+ if 5.0 <= alcohol <= 25.0: # Reasonable wine alcohol range
260
+ return alcohol
261
+ except ValueError:
262
+ continue
263
+
264
+ return None
265
+
266
+ def _extract_grape_variety(self, text: str) -> str | None:
267
+ """Extract grape variety from text."""
268
+ text_lower = text.lower()
269
+
270
+ for grape in GRAPE_VARIETIES:
271
+ if grape.lower() in text_lower:
272
+ return grape
273
+
274
+ return None
275
+
276
+ def _extract_region(self, text: str) -> str | None:
277
+ """Extract wine region from text."""
278
+ text_lower = text.lower()
279
+
280
+ for region in WINE_REGIONS:
281
+ if region.lower() in text_lower:
282
+ return region
283
+
284
+ return None
285
+
286
+ def _extract_country(self, text: str) -> str | None:
287
+ """Extract country from text."""
288
+ text_lower = text.lower()
289
+
290
+ # Direct country mentions
291
+ for country in WINE_COUNTRIES:
292
+ if country.lower() in text_lower:
293
+ # Normalize some entries
294
+ if country in ["California", "Oregon", "Washington"]:
295
+ return "United States"
296
+ if country == "USA":
297
+ return "United States"
298
+ return country
299
+
300
+ # Infer from region if possible
301
+ region = self._extract_region(text)
302
+ if region:
303
+ region_to_country = {
304
+ "Bordeaux": "France",
305
+ "Burgundy": "France",
306
+ "Champagne": "France",
307
+ "Tuscany": "Italy",
308
+ "Piedmont": "Italy",
309
+ "Rioja": "Spain",
310
+ "Napa Valley": "United States",
311
+ "Marlborough": "New Zealand",
312
+ "Mendoza": "Argentina",
313
+ "Barossa Valley": "Australia",
314
+ }
315
+ return region_to_country.get(region)
316
+
317
+ return None
318
+
319
+ def _extract_winery(self, text: str) -> str | None:
320
+ """Extract winery name from text.
321
+
322
+ Typically the winery name appears at the top of the label,
323
+ often in larger text. This is a simple heuristic.
324
+ """
325
+ lines = text.strip().split("\n")
326
+
327
+ # First non-empty line that's not a year or standard phrase
328
+ for line in lines[:5]: # Check first 5 lines
329
+ line = line.strip()
330
+ if not line:
331
+ continue
332
+
333
+ # Skip if it's just a year
334
+ if re.match(r"^\d{4}$", line):
335
+ continue
336
+
337
+ # Skip if it's a common label phrase
338
+ skip_phrases = [
339
+ "product of",
340
+ "produced by",
341
+ "bottled by",
342
+ "imported by",
343
+ "contains sulfites",
344
+ "alcohol",
345
+ "estate",
346
+ "reserve",
347
+ ]
348
+ if any(phrase in line.lower() for phrase in skip_phrases):
349
+ continue
350
+
351
+ # Skip if it looks like alcohol content
352
+ if re.search(r"\d+[.,]?\d*\s*%", line):
353
+ continue
354
+
355
+ # If line has reasonable length, might be winery name
356
+ if 3 <= len(line) <= 50:
357
+ return line
358
+
359
+ return None
360
+
361
+ def _extract_name(self, text: str, parsed: dict) -> str | None:
362
+ """Extract wine name from text.
363
+
364
+ Often the wine name includes grape variety, region, or
365
+ is a distinctive name on the label.
366
+ """
367
+ lines = text.strip().split("\n")
368
+
369
+ # Try to find a distinctive wine name
370
+ candidates = []
371
+
372
+ for line in lines:
373
+ line = line.strip()
374
+ if not line or len(line) < 3:
375
+ continue
376
+
377
+ # Check if line could be a wine name
378
+ # Skip very short or very long lines
379
+ if len(line) < 3 or len(line) > 60:
380
+ continue
381
+
382
+ # Skip year-only lines
383
+ if re.match(r"^\d{4}$", line):
384
+ continue
385
+
386
+ # Skip alcohol percentage lines
387
+ if re.search(r"\d+[.,]?\d*\s*%", line):
388
+ continue
389
+
390
+ candidates.append(line)
391
+
392
+ if candidates:
393
+ # Prefer line with grape variety or vintage if present
394
+ grape = parsed.get("grape_variety", "")
395
+ vintage = parsed.get("vintage")
396
+
397
+ for candidate in candidates:
398
+ if grape and grape.lower() in candidate.lower():
399
+ return candidate
400
+ if vintage and str(vintage) in candidate:
401
+ return candidate
402
+
403
+ # Fall back to first candidate that's not the winery
404
+ winery = parsed.get("winery", "")
405
+ for candidate in candidates:
406
+ if candidate != winery:
407
+ return candidate
408
+
409
+ return candidates[0] if candidates else None
410
+
411
+ return None