winebox 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- winebox/__init__.py +3 -0
- winebox/cli/__init__.py +1 -0
- winebox/cli/server.py +313 -0
- winebox/cli/user_admin.py +258 -0
- winebox/config.py +43 -0
- winebox/database.py +47 -0
- winebox/main.py +78 -0
- winebox/models/__init__.py +8 -0
- winebox/models/inventory.py +46 -0
- winebox/models/transaction.py +64 -0
- winebox/models/user.py +55 -0
- winebox/models/wine.py +66 -0
- winebox/routers/__init__.py +5 -0
- winebox/routers/auth.py +90 -0
- winebox/routers/cellar.py +102 -0
- winebox/routers/search.py +127 -0
- winebox/routers/transactions.py +63 -0
- winebox/routers/wines.py +287 -0
- winebox/schemas/__init__.py +13 -0
- winebox/schemas/transaction.py +40 -0
- winebox/schemas/wine.py +79 -0
- winebox/services/__init__.py +7 -0
- winebox/services/auth.py +123 -0
- winebox/services/image_storage.py +90 -0
- winebox/services/ocr.py +128 -0
- winebox/services/wine_parser.py +411 -0
- winebox/static/css/style.css +1086 -0
- winebox/static/index.html +271 -0
- winebox/static/js/app.js +703 -0
- winebox-0.1.0.dist-info/METADATA +283 -0
- winebox-0.1.0.dist-info/RECORD +34 -0
- winebox-0.1.0.dist-info/WHEEL +4 -0
- winebox-0.1.0.dist-info/entry_points.txt +3 -0
- winebox-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""Image storage service for managing wine label images."""
|
|
2
|
+
|
|
3
|
+
import uuid
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import aiofiles
|
|
7
|
+
from fastapi import UploadFile
|
|
8
|
+
|
|
9
|
+
from winebox.config import settings
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ImageStorageService:
    """Service for storing and managing wine label images.

    Images live as flat files inside ``storage_path``. Filenames handed to
    the lookup/delete methods are treated as untrusted: anything that would
    resolve outside the storage directory is handled as "not found".
    """

    # Extensions kept as-is on upload; any other suffix is stored as ".jpg".
    ALLOWED_EXTENSIONS = frozenset({".jpg", ".jpeg", ".png", ".gif", ".webp"})

    # Uploads are copied to disk in pieces of this many bytes so that a large
    # file is never held in memory in one go.
    CHUNK_SIZE = 1024 * 1024

    def __init__(self, storage_path: Path | None = None) -> None:
        """Initialize the image storage service.

        Args:
            storage_path: Path to store images. Defaults to config setting.
        """
        self.storage_path = storage_path or settings.image_storage_path
        self.storage_path.mkdir(parents=True, exist_ok=True)

    def _safe_path(self, filename: str) -> Path | None:
        """Return ``storage_path / filename`` if it stays inside the storage dir.

        Args:
            filename: Name supplied by the caller (possibly user-controlled).

        Returns:
            The joined (unresolved) path, or None when the name is empty,
            cannot be resolved, or points at or outside the storage directory
            (for example ``../secret`` or an absolute path).
        """
        if not filename:
            return None

        candidate = self.storage_path / filename
        try:
            root = self.storage_path.resolve()
            resolved = candidate.resolve()
        except (OSError, RuntimeError, ValueError):
            # Symlink loops, embedded NUL bytes, etc.: treat as not found.
            return None

        if resolved == root or not resolved.is_relative_to(root):
            return None
        return candidate

    async def save_image(self, upload_file: UploadFile) -> str:
        """Save an uploaded image file.

        The stored name is a fresh UUID plus the upload's extension; the
        client-supplied name never reaches the filesystem.

        Args:
            upload_file: The uploaded file from FastAPI.

        Returns:
            The filename of the saved image.
        """
        ext = Path(upload_file.filename or "image.jpg").suffix.lower()
        if ext not in self.ALLOWED_EXTENSIONS:
            ext = ".jpg"

        filename = f"{uuid.uuid4()}{ext}"
        file_path = self.storage_path / filename

        try:
            async with aiofiles.open(file_path, "wb") as f:
                while chunk := await upload_file.read(self.CHUNK_SIZE):
                    await f.write(chunk)
        except BaseException:
            # Do not leave a truncated image behind if the copy is interrupted.
            file_path.unlink(missing_ok=True)
            raise

        return filename

    async def delete_image(self, filename: str) -> bool:
        """Delete an image file.

        Args:
            filename: The filename to delete.

        Returns:
            True if deleted, False if not found (or not a file inside the
            storage directory).
        """
        file_path = self._safe_path(filename)
        if file_path is None or not file_path.is_file():
            return False

        try:
            file_path.unlink()
        except FileNotFoundError:
            # Removed by someone else between the check and the unlink.
            return False
        return True

    def get_image_path(self, filename: str) -> Path | None:
        """Get the full path to an image file.

        Args:
            filename: The filename to look up.

        Returns:
            The full path if the file exists inside the storage directory,
            None otherwise.
        """
        file_path = self._safe_path(filename)
        if file_path is not None and file_path.is_file():
            return file_path
        return None

    def get_image_url(self, filename: str) -> str:
        """Get the URL path for an image.

        Args:
            filename: The image filename.

        Returns:
            The URL path to access the image.
        """
        return f"/api/images/{filename}"
|
winebox/services/ocr.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""OCR service for extracting text from wine label images."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from PIL import Image
|
|
7
|
+
|
|
8
|
+
from winebox.config import settings
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class OCRService:
|
|
14
|
+
"""Service for extracting text from images using Tesseract OCR."""
|
|
15
|
+
|
|
16
|
+
def __init__(self) -> None:
|
|
17
|
+
"""Initialize the OCR service."""
|
|
18
|
+
# Configure Tesseract command if specified
|
|
19
|
+
if settings.tesseract_cmd:
|
|
20
|
+
import pytesseract
|
|
21
|
+
pytesseract.pytesseract.tesseract_cmd = settings.tesseract_cmd
|
|
22
|
+
|
|
23
|
+
async def extract_text(self, image_path: str | Path) -> str:
|
|
24
|
+
"""Extract text from an image file.
|
|
25
|
+
|
|
26
|
+
Args:
|
|
27
|
+
image_path: Path to the image file (relative to image storage).
|
|
28
|
+
|
|
29
|
+
Returns:
|
|
30
|
+
Extracted text from the image.
|
|
31
|
+
"""
|
|
32
|
+
try:
|
|
33
|
+
import pytesseract
|
|
34
|
+
|
|
35
|
+
# Build full path if relative
|
|
36
|
+
if isinstance(image_path, str) and not Path(image_path).is_absolute():
|
|
37
|
+
full_path = settings.image_storage_path / image_path
|
|
38
|
+
else:
|
|
39
|
+
full_path = Path(image_path)
|
|
40
|
+
|
|
41
|
+
if not full_path.exists():
|
|
42
|
+
logger.warning(f"Image file not found: {full_path}")
|
|
43
|
+
return ""
|
|
44
|
+
|
|
45
|
+
# Open image and extract text
|
|
46
|
+
image = Image.open(full_path)
|
|
47
|
+
|
|
48
|
+
# Preprocess image for better OCR results
|
|
49
|
+
# Convert to grayscale
|
|
50
|
+
if image.mode != "L":
|
|
51
|
+
image = image.convert("L")
|
|
52
|
+
|
|
53
|
+
# Extract text
|
|
54
|
+
text = pytesseract.image_to_string(
|
|
55
|
+
image,
|
|
56
|
+
lang="eng",
|
|
57
|
+
config="--psm 6", # Assume uniform block of text
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
return text.strip()
|
|
61
|
+
|
|
62
|
+
except ImportError:
|
|
63
|
+
logger.error("pytesseract is not installed")
|
|
64
|
+
return ""
|
|
65
|
+
except Exception as e:
|
|
66
|
+
logger.error(f"OCR extraction failed: {e}")
|
|
67
|
+
return ""
|
|
68
|
+
|
|
69
|
+
async def extract_text_with_confidence(
|
|
70
|
+
self, image_path: str | Path
|
|
71
|
+
) -> tuple[str, float]:
|
|
72
|
+
"""Extract text from an image with confidence score.
|
|
73
|
+
|
|
74
|
+
Args:
|
|
75
|
+
image_path: Path to the image file.
|
|
76
|
+
|
|
77
|
+
Returns:
|
|
78
|
+
Tuple of (extracted_text, average_confidence).
|
|
79
|
+
"""
|
|
80
|
+
try:
|
|
81
|
+
import pytesseract
|
|
82
|
+
|
|
83
|
+
# Build full path if relative
|
|
84
|
+
if isinstance(image_path, str) and not Path(image_path).is_absolute():
|
|
85
|
+
full_path = settings.image_storage_path / image_path
|
|
86
|
+
else:
|
|
87
|
+
full_path = Path(image_path)
|
|
88
|
+
|
|
89
|
+
if not full_path.exists():
|
|
90
|
+
logger.warning(f"Image file not found: {full_path}")
|
|
91
|
+
return "", 0.0
|
|
92
|
+
|
|
93
|
+
image = Image.open(full_path)
|
|
94
|
+
|
|
95
|
+
if image.mode != "L":
|
|
96
|
+
image = image.convert("L")
|
|
97
|
+
|
|
98
|
+
# Get detailed data with confidence
|
|
99
|
+
data = pytesseract.image_to_data(
|
|
100
|
+
image,
|
|
101
|
+
lang="eng",
|
|
102
|
+
config="--psm 6",
|
|
103
|
+
output_type=pytesseract.Output.DICT,
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
# Calculate average confidence for words
|
|
107
|
+
confidences = [
|
|
108
|
+
conf
|
|
109
|
+
for conf, text in zip(data["conf"], data["text"])
|
|
110
|
+
if conf > 0 and text.strip()
|
|
111
|
+
]
|
|
112
|
+
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
|
|
113
|
+
|
|
114
|
+
# Get full text
|
|
115
|
+
text = pytesseract.image_to_string(
|
|
116
|
+
image,
|
|
117
|
+
lang="eng",
|
|
118
|
+
config="--psm 6",
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
return text.strip(), avg_confidence
|
|
122
|
+
|
|
123
|
+
except ImportError:
|
|
124
|
+
logger.error("pytesseract is not installed")
|
|
125
|
+
return "", 0.0
|
|
126
|
+
except Exception as e:
|
|
127
|
+
logger.error(f"OCR extraction failed: {e}")
|
|
128
|
+
return "", 0.0
|
|
@@ -0,0 +1,411 @@
|
|
|
1
|
+
"""Wine parser service for extracting structured data from OCR text."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import re
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
# Common grape varieties
|
|
10
|
+
GRAPE_VARIETIES = [
    # Scanned in list order by WineParserService._extract_grape_variety, which
    # returns the first entry found in the label text (case-insensitive).
    # A longer name is therefore listed before any shorter name it contains
    # (e.g. "Cabernet Sauvignon" before "Cabernet").
    # Names are spelled without diacritics.
    "Cabernet Sauvignon",
    "Cabernet",
    "Merlot",
    "Pinot Noir",
    "Pinot Grigio",
    "Pinot Gris",
    "Chardonnay",
    "Sauvignon Blanc",
    "Riesling",
    "Syrah",
    "Shiraz",
    "Zinfandel",
    "Malbec",
    "Tempranillo",
    "Sangiovese",
    "Nebbiolo",
    "Grenache",
    "Garnacha",
    "Mourvedre",
    "Viognier",
    "Gewurztraminer",
    "Chenin Blanc",
    "Semillon",
    "Muscat",
    "Moscato",
    # "Prosecco" and "Champagne" also appear in WINE_REGIONS.
    "Prosecco",
    "Champagne",
    "Cava",
    "Albarino",
    "Gruner Veltliner",
    "Torrontes",
    "Verdejo",
    "Carmenere",
    "Petite Sirah",
    "Petit Verdot",
    "Barbera",
    "Primitivo",
    "Montepulciano",
    "Nero d'Avola",
    "Vermentino",
    "Fiano",
    "Trebbiano",
    "Corvina",
    "Gamay",
    "Beaujolais",
]
|
|
57
|
+
|
|
58
|
+
# Common wine regions
|
|
59
|
+
WINE_REGIONS = [
    # Scanned in list order by WineParserService._extract_region, which
    # returns the first entry found in the label text. A longer name must
    # therefore come before any shorter name it contains, otherwise the longer
    # one can never be returned ("Cotes du Rhone" before "Rhone",
    # "Napa Valley" before "Napa", "Brunello di Montalcino" before
    # "Montalcino").
    # France
    "Bordeaux",
    "Burgundy",
    "Bourgogne",
    "Champagne",
    "Cotes du Rhone",
    "Rhone",
    "Loire",
    "Alsace",
    "Provence",
    "Languedoc",
    "Medoc",
    "Saint-Emilion",
    "Pauillac",
    "Margaux",
    "Chablis",
    "Beaune",
    "Sancerre",
    "Pouilly-Fume",
    # Italy
    "Tuscany",
    "Toscana",
    "Piedmont",
    "Piemonte",
    "Veneto",
    "Sicily",
    "Sicilia",
    "Chianti",
    "Barolo",
    "Barbaresco",
    "Brunello di Montalcino",
    "Montalcino",
    "Valpolicella",
    "Amarone",
    "Prosecco",
    "Friuli",
    # Spain
    "Rioja",
    "Ribera del Duero",
    "Priorat",
    "Rias Baixas",
    "Rueda",
    "Navarra",
    "La Mancha",
    "Jerez",
    "Sherry",
    # USA
    "Napa Valley",
    "Napa",
    "Sonoma",
    "Willamette Valley",
    "Paso Robles",
    "Santa Barbara",
    "Russian River",
    "Alexander Valley",
    "Central Coast",
    "Oregon",
    "Washington",
    # Other
    "Marlborough",
    "Mendoza",
    "Stellenbosch",
    "Mosel",
    "Rheingau",
    "Pfalz",
    "Douro",
    "Alentejo",
    "Barossa Valley",
    "Hunter Valley",
    "Margaret River",
    "Hawke's Bay",
    "Maipo Valley",
    "Colchagua",
    "Casablanca",
]
|
|
135
|
+
|
|
136
|
+
# Countries commonly associated with wine
|
|
137
|
+
WINE_COUNTRIES = [
    # Scanned in list order by WineParserService._extract_country, which
    # returns the first entry found in the label text (case-insensitive).
    "France",
    "Italy",
    "Spain",
    "Portugal",
    "Germany",
    "Austria",
    "United States",
    # "USA" and the three US states below are not returned verbatim:
    # _extract_country reports them as "United States".
    "USA",
    "California",
    "Oregon",
    "Washington",
    "Australia",
    "New Zealand",
    "Argentina",
    "Chile",
    "South Africa",
    "Greece",
    "Lebanon",
    "Israel",
    "Canada",
    "Hungary",
    "Romania",
    "Georgia",
    "Slovenia",
    "Croatia",
]
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class WineParserService:
|
|
167
|
+
"""Service for parsing wine information from OCR text."""
|
|
168
|
+
|
|
169
|
+
def parse(self, text: str) -> dict[str, Any]:
|
|
170
|
+
"""Parse OCR text to extract wine information.
|
|
171
|
+
|
|
172
|
+
Args:
|
|
173
|
+
text: Raw OCR text from wine labels.
|
|
174
|
+
|
|
175
|
+
Returns:
|
|
176
|
+
Dictionary with extracted wine information.
|
|
177
|
+
"""
|
|
178
|
+
result: dict[str, Any] = {}
|
|
179
|
+
|
|
180
|
+
if not text:
|
|
181
|
+
return result
|
|
182
|
+
|
|
183
|
+
# Clean text
|
|
184
|
+
text_clean = text.strip()
|
|
185
|
+
text_upper = text_clean.upper()
|
|
186
|
+
text_lower = text_clean.lower()
|
|
187
|
+
|
|
188
|
+
# Extract vintage year
|
|
189
|
+
vintage = self._extract_vintage(text_clean)
|
|
190
|
+
if vintage:
|
|
191
|
+
result["vintage"] = vintage
|
|
192
|
+
|
|
193
|
+
# Extract alcohol percentage
|
|
194
|
+
alcohol = self._extract_alcohol(text_clean)
|
|
195
|
+
if alcohol:
|
|
196
|
+
result["alcohol_percentage"] = alcohol
|
|
197
|
+
|
|
198
|
+
# Extract grape variety
|
|
199
|
+
grape = self._extract_grape_variety(text_clean)
|
|
200
|
+
if grape:
|
|
201
|
+
result["grape_variety"] = grape
|
|
202
|
+
|
|
203
|
+
# Extract region
|
|
204
|
+
region = self._extract_region(text_clean)
|
|
205
|
+
if region:
|
|
206
|
+
result["region"] = region
|
|
207
|
+
|
|
208
|
+
# Extract country
|
|
209
|
+
country = self._extract_country(text_clean)
|
|
210
|
+
if country:
|
|
211
|
+
result["country"] = country
|
|
212
|
+
|
|
213
|
+
# Try to extract winery name (usually at the top of front label)
|
|
214
|
+
winery = self._extract_winery(text_clean)
|
|
215
|
+
if winery:
|
|
216
|
+
result["winery"] = winery
|
|
217
|
+
|
|
218
|
+
# Try to extract wine name
|
|
219
|
+
name = self._extract_name(text_clean, result)
|
|
220
|
+
if name:
|
|
221
|
+
result["name"] = name
|
|
222
|
+
|
|
223
|
+
return result
|
|
224
|
+
|
|
225
|
+
def _extract_vintage(self, text: str) -> int | None:
|
|
226
|
+
"""Extract vintage year from text."""
|
|
227
|
+
# Look for 4-digit years between 1900 and current year + 2
|
|
228
|
+
pattern = r"\b(19\d{2}|20[0-2]\d)\b"
|
|
229
|
+
matches = re.findall(pattern, text)
|
|
230
|
+
|
|
231
|
+
if matches:
|
|
232
|
+
# Prefer years that look like vintages (not recent years like current year)
|
|
233
|
+
for year_str in matches:
|
|
234
|
+
year = int(year_str)
|
|
235
|
+
if 1950 <= year <= 2025:
|
|
236
|
+
return year
|
|
237
|
+
|
|
238
|
+
# Fall back to first found year
|
|
239
|
+
return int(matches[0])
|
|
240
|
+
|
|
241
|
+
return None
|
|
242
|
+
|
|
243
|
+
def _extract_alcohol(self, text: str) -> float | None:
|
|
244
|
+
"""Extract alcohol percentage from text."""
|
|
245
|
+
# Various patterns for alcohol content
|
|
246
|
+
patterns = [
|
|
247
|
+
r"(\d{1,2}[.,]\d{1,2})\s*%\s*(?:vol|alc|alcohol|abv)?",
|
|
248
|
+
r"(?:alc|alcohol|abv)[:\s]*(\d{1,2}[.,]\d{1,2})\s*%",
|
|
249
|
+
r"(\d{1,2}[.,]\d{1,2})\s*%\s*vol",
|
|
250
|
+
r"(\d{1,2})\s*%\s*(?:vol|alc|alcohol|abv)",
|
|
251
|
+
]
|
|
252
|
+
|
|
253
|
+
for pattern in patterns:
|
|
254
|
+
match = re.search(pattern, text, re.IGNORECASE)
|
|
255
|
+
if match:
|
|
256
|
+
value = match.group(1).replace(",", ".")
|
|
257
|
+
try:
|
|
258
|
+
alcohol = float(value)
|
|
259
|
+
if 5.0 <= alcohol <= 25.0: # Reasonable wine alcohol range
|
|
260
|
+
return alcohol
|
|
261
|
+
except ValueError:
|
|
262
|
+
continue
|
|
263
|
+
|
|
264
|
+
return None
|
|
265
|
+
|
|
266
|
+
def _extract_grape_variety(self, text: str) -> str | None:
|
|
267
|
+
"""Extract grape variety from text."""
|
|
268
|
+
text_lower = text.lower()
|
|
269
|
+
|
|
270
|
+
for grape in GRAPE_VARIETIES:
|
|
271
|
+
if grape.lower() in text_lower:
|
|
272
|
+
return grape
|
|
273
|
+
|
|
274
|
+
return None
|
|
275
|
+
|
|
276
|
+
def _extract_region(self, text: str) -> str | None:
|
|
277
|
+
"""Extract wine region from text."""
|
|
278
|
+
text_lower = text.lower()
|
|
279
|
+
|
|
280
|
+
for region in WINE_REGIONS:
|
|
281
|
+
if region.lower() in text_lower:
|
|
282
|
+
return region
|
|
283
|
+
|
|
284
|
+
return None
|
|
285
|
+
|
|
286
|
+
def _extract_country(self, text: str) -> str | None:
|
|
287
|
+
"""Extract country from text."""
|
|
288
|
+
text_lower = text.lower()
|
|
289
|
+
|
|
290
|
+
# Direct country mentions
|
|
291
|
+
for country in WINE_COUNTRIES:
|
|
292
|
+
if country.lower() in text_lower:
|
|
293
|
+
# Normalize some entries
|
|
294
|
+
if country in ["California", "Oregon", "Washington"]:
|
|
295
|
+
return "United States"
|
|
296
|
+
if country == "USA":
|
|
297
|
+
return "United States"
|
|
298
|
+
return country
|
|
299
|
+
|
|
300
|
+
# Infer from region if possible
|
|
301
|
+
region = self._extract_region(text)
|
|
302
|
+
if region:
|
|
303
|
+
region_to_country = {
|
|
304
|
+
"Bordeaux": "France",
|
|
305
|
+
"Burgundy": "France",
|
|
306
|
+
"Champagne": "France",
|
|
307
|
+
"Tuscany": "Italy",
|
|
308
|
+
"Piedmont": "Italy",
|
|
309
|
+
"Rioja": "Spain",
|
|
310
|
+
"Napa Valley": "United States",
|
|
311
|
+
"Marlborough": "New Zealand",
|
|
312
|
+
"Mendoza": "Argentina",
|
|
313
|
+
"Barossa Valley": "Australia",
|
|
314
|
+
}
|
|
315
|
+
return region_to_country.get(region)
|
|
316
|
+
|
|
317
|
+
return None
|
|
318
|
+
|
|
319
|
+
def _extract_winery(self, text: str) -> str | None:
|
|
320
|
+
"""Extract winery name from text.
|
|
321
|
+
|
|
322
|
+
Typically the winery name appears at the top of the label,
|
|
323
|
+
often in larger text. This is a simple heuristic.
|
|
324
|
+
"""
|
|
325
|
+
lines = text.strip().split("\n")
|
|
326
|
+
|
|
327
|
+
# First non-empty line that's not a year or standard phrase
|
|
328
|
+
for line in lines[:5]: # Check first 5 lines
|
|
329
|
+
line = line.strip()
|
|
330
|
+
if not line:
|
|
331
|
+
continue
|
|
332
|
+
|
|
333
|
+
# Skip if it's just a year
|
|
334
|
+
if re.match(r"^\d{4}$", line):
|
|
335
|
+
continue
|
|
336
|
+
|
|
337
|
+
# Skip if it's a common label phrase
|
|
338
|
+
skip_phrases = [
|
|
339
|
+
"product of",
|
|
340
|
+
"produced by",
|
|
341
|
+
"bottled by",
|
|
342
|
+
"imported by",
|
|
343
|
+
"contains sulfites",
|
|
344
|
+
"alcohol",
|
|
345
|
+
"estate",
|
|
346
|
+
"reserve",
|
|
347
|
+
]
|
|
348
|
+
if any(phrase in line.lower() for phrase in skip_phrases):
|
|
349
|
+
continue
|
|
350
|
+
|
|
351
|
+
# Skip if it looks like alcohol content
|
|
352
|
+
if re.search(r"\d+[.,]?\d*\s*%", line):
|
|
353
|
+
continue
|
|
354
|
+
|
|
355
|
+
# If line has reasonable length, might be winery name
|
|
356
|
+
if 3 <= len(line) <= 50:
|
|
357
|
+
return line
|
|
358
|
+
|
|
359
|
+
return None
|
|
360
|
+
|
|
361
|
+
def _extract_name(self, text: str, parsed: dict) -> str | None:
|
|
362
|
+
"""Extract wine name from text.
|
|
363
|
+
|
|
364
|
+
Often the wine name includes grape variety, region, or
|
|
365
|
+
is a distinctive name on the label.
|
|
366
|
+
"""
|
|
367
|
+
lines = text.strip().split("\n")
|
|
368
|
+
|
|
369
|
+
# Try to find a distinctive wine name
|
|
370
|
+
candidates = []
|
|
371
|
+
|
|
372
|
+
for line in lines:
|
|
373
|
+
line = line.strip()
|
|
374
|
+
if not line or len(line) < 3:
|
|
375
|
+
continue
|
|
376
|
+
|
|
377
|
+
# Check if line could be a wine name
|
|
378
|
+
# Skip very short or very long lines
|
|
379
|
+
if len(line) < 3 or len(line) > 60:
|
|
380
|
+
continue
|
|
381
|
+
|
|
382
|
+
# Skip year-only lines
|
|
383
|
+
if re.match(r"^\d{4}$", line):
|
|
384
|
+
continue
|
|
385
|
+
|
|
386
|
+
# Skip alcohol percentage lines
|
|
387
|
+
if re.search(r"\d+[.,]?\d*\s*%", line):
|
|
388
|
+
continue
|
|
389
|
+
|
|
390
|
+
candidates.append(line)
|
|
391
|
+
|
|
392
|
+
if candidates:
|
|
393
|
+
# Prefer line with grape variety or vintage if present
|
|
394
|
+
grape = parsed.get("grape_variety", "")
|
|
395
|
+
vintage = parsed.get("vintage")
|
|
396
|
+
|
|
397
|
+
for candidate in candidates:
|
|
398
|
+
if grape and grape.lower() in candidate.lower():
|
|
399
|
+
return candidate
|
|
400
|
+
if vintage and str(vintage) in candidate:
|
|
401
|
+
return candidate
|
|
402
|
+
|
|
403
|
+
# Fall back to first candidate that's not the winery
|
|
404
|
+
winery = parsed.get("winery", "")
|
|
405
|
+
for candidate in candidates:
|
|
406
|
+
if candidate != winery:
|
|
407
|
+
return candidate
|
|
408
|
+
|
|
409
|
+
return candidates[0] if candidates else None
|
|
410
|
+
|
|
411
|
+
return None
|