recipe-clipper 0.1.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- recipe_clipper/__init__.py +7 -0
- recipe_clipper/cli.py +268 -0
- recipe_clipper/clipper.py +50 -0
- recipe_clipper/exceptions.py +31 -0
- recipe_clipper/formatters.py +125 -0
- recipe_clipper/http.py +61 -0
- recipe_clipper/models.py +44 -0
- recipe_clipper/parsers/__init__.py +1 -0
- recipe_clipper/parsers/llm_parser.py +356 -0
- recipe_clipper/parsers/recipe_scrapers_parser.py +50 -0
- recipe_clipper-0.1.0a0.dist-info/METADATA +337 -0
- recipe_clipper-0.1.0a0.dist-info/RECORD +15 -0
- recipe_clipper-0.1.0a0.dist-info/WHEEL +4 -0
- recipe_clipper-0.1.0a0.dist-info/entry_points.txt +2 -0
- recipe_clipper-0.1.0a0.dist-info/licenses/LICENSE +21 -0
recipe_clipper/cli.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
"""Command-line interface for recipe clipper."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
import typer
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from dotenv import load_dotenv
|
|
11
|
+
|
|
12
|
+
from recipe_clipper.clipper import clip_recipe
|
|
13
|
+
from recipe_clipper.parsers.llm_parser import (
|
|
14
|
+
parse_recipe_from_image,
|
|
15
|
+
parse_recipe_from_document,
|
|
16
|
+
)
|
|
17
|
+
from recipe_clipper.formatters import (
|
|
18
|
+
format_recipe_text,
|
|
19
|
+
format_recipe_json,
|
|
20
|
+
format_recipe_markdown,
|
|
21
|
+
)
|
|
22
|
+
from recipe_clipper.exceptions import RecipeClipperError
|
|
23
|
+
|
|
24
|
+
# Run python-dotenv at import time so a local .env file can supply
# ANTHROPIC_API_KEY, which the commands below read through ``envvar=``.
load_dotenv()


# Typer application object; the commands in this module register themselves
# on it with ``@app.command()``.
app = typer.Typer(
    name="recipe-clipper",
    help="Extract recipes from websites with ease",
    add_completion=False,
)
# Shared rich console used by the CLI helpers for status and result output.
console = Console()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class OutputFormat(str, Enum):
    """Output formats selectable through the ``--format`` option.

    Mixing in ``str`` makes every member compare equal to its plain string
    value.
    """

    text = "text"
    json = "json"
    markdown = "markdown"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# Helper functions for common CLI operations
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _format_recipe(recipe, format: OutputFormat) -> str:
    """Render ``recipe`` with the formatter matching ``format``.

    Any value other than ``OutputFormat.json`` or ``OutputFormat.markdown``
    is rendered as plain text.
    """
    if format == OutputFormat.json:
        renderer = format_recipe_json
    elif format == OutputFormat.markdown:
        renderer = format_recipe_markdown
    else:
        renderer = format_recipe_text
    return renderer(recipe)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _write_output(output_text: str, output: Optional[Path]) -> None:
    """Write the formatted recipe to ``output``, or to stdout when no path is given.

    Args:
        output_text: Formatted recipe text to emit.
        output: Destination file path; a falsy value means "print to stdout".
    """
    if output:
        # Recipes routinely contain non-ASCII characters (fractions, bullets,
        # accents); pin UTF-8 instead of relying on the platform default.
        output.write_text(output_text, encoding="utf-8")
        console.print(f"[green]✓[/green] Recipe saved to {output}")
    else:
        # Emit the text verbatim: brackets in a recipe are not style tags,
        # ":name:" is not an emoji code, and re-wrapping long lines would
        # insert newlines inside JSON strings and corrupt the output.
        console.print(
            output_text,
            markup=False,
            emoji=False,
            highlight=False,
            soft_wrap=True,
        )
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _handle_recipe_extraction(
    extract_func, source_description: str, format: OutputFormat, output: Optional[Path]
) -> None:
    """Run an extraction callable, then format and emit the resulting recipe.

    Args:
        extract_func: Zero-argument callable returning the extracted recipe.
        source_description: Text shown in the progress spinner while extracting.
        format: Output format used to render the recipe.
        output: Destination file, or ``None`` to print to stdout.

    Raises:
        typer.Exit: With exit code 1 when extraction, formatting or writing fails.
    """
    # Local import keeps this fix self-contained; rich is already a dependency.
    from rich.markup import escape

    # Console.print() has no ``file`` parameter (passing one raises TypeError
    # from inside the handler), so errors go through a stderr-bound console.
    error_console = Console(stderr=True)

    try:
        with console.status(f"[bold blue]{source_description}..."):
            recipe = extract_func()

        output_text = _format_recipe(recipe, format)
        _write_output(output_text, output)

    except RecipeClipperError as e:
        # Escape the message so brackets in it are not parsed as rich markup.
        error_console.print(f"[red]Error:[/red] {escape(str(e))}")
        raise typer.Exit(code=1) from e
    except Exception as e:
        # Top-level CLI boundary: report anything unexpected and exit non-zero.
        error_console.print(f"[red]Unexpected error:[/red] {escape(str(e))}")
        raise typer.Exit(code=1) from e
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _validate_file_exists(path: Path, file_type: str) -> None:
    """Exit the CLI with code 1 unless ``path`` points to an existing file.

    Args:
        path: Path supplied on the command line.
        file_type: Capitalised label used in the error message (e.g. "Image").

    Raises:
        typer.Exit: With exit code 1 when ``path`` is missing or not a regular file.
    """
    # is_file() also rejects directories, which exists() would let through
    # only to fail later when the file is opened.
    if not path.is_file():
        # Local import keeps this fix self-contained; rich is already a dependency.
        from rich.markup import escape

        # Console.print() has no ``file`` parameter; use a stderr-bound console.
        Console(stderr=True).print(
            f"[red]Error:[/red] {file_type} file not found: {escape(str(path))}"
        )
        raise typer.Exit(code=1)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _require_api_key(api_key: Optional[str], operation: str) -> str:
    """Return ``api_key`` if it is set, otherwise report the problem and exit.

    Args:
        api_key: Key taken from ``--api-key`` or the ANTHROPIC_API_KEY variable.
        operation: Short description of the operation that needs the key.

    Returns:
        The non-empty API key.

    Raises:
        typer.Exit: With exit code 1 when no key was provided.
    """
    if not api_key:
        # Console.print() has no ``file`` parameter (passing one raises
        # TypeError), so the message goes through a stderr-bound console.
        Console(stderr=True).print(
            f"[red]Error:[/red] API key is required for {operation}. "
            "Set ANTHROPIC_API_KEY environment variable or use --api-key option.",
        )
        raise typer.Exit(code=1)
    return api_key
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@app.command()
def clip_webpage(
    url: str = typer.Argument(..., help="URL of the recipe to extract"),
    format: OutputFormat = typer.Option(
        OutputFormat.text,
        "--format",
        "-f",
        help="Output format (text, json, or markdown)",
    ),
    output: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Output file path (prints to stdout if not specified)",
    ),
    timeout: int = typer.Option(
        10,
        "--timeout",
        "-t",
        help="HTTP request timeout in seconds",
    ),
    api_key: Optional[str] = typer.Option(
        None,
        "--api-key",
        envvar="ANTHROPIC_API_KEY",
        help="Anthropic API key for LLM fallback (can also use ANTHROPIC_API_KEY env var)",
    ),
    use_llm_fallback: bool = typer.Option(
        True,
        "--use-llm-fallback/--no-llm-fallback",
        help="Use LLM fallback if recipe-scrapers fails",
    ),
):
    """
    Extract a recipe from a URL.

    Examples:

        recipe-clipper clip-webpage https://www.allrecipes.com/recipe/10813/best-chocolate-chip-cookies/

        recipe-clipper clip-webpage https://example.com/recipe --format json --output recipe.json

        recipe-clipper clip-webpage https://example.com/recipe --format markdown --output recipe.md

        recipe-clipper clip-webpage https://unsupported-site.com/recipe --api-key sk-ant-... --use-llm-fallback
    """
    if use_llm_fallback and not api_key:
        # clip_recipe() raises ValueError when the fallback is enabled without
        # a key. Because the fallback defaults to on, that would make this
        # command fail without an API key even for pages recipe-scrapers can
        # parse, so degrade to scraper-only mode and say so on stderr.
        Console(stderr=True).print(
            "[yellow]Note:[/yellow] no API key provided; LLM fallback disabled."
        )
        use_llm_fallback = False

    # The lambda defers the fetch so the shared handler can run it inside its
    # progress spinner and error handling.
    _handle_recipe_extraction(
        extract_func=lambda: clip_recipe(
            url, api_key=api_key, use_llm_fallback=use_llm_fallback, timeout=timeout
        ),
        source_description=f"Fetching recipe from {url}",
        format=format,
        output=output,
    )
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
@app.command()
def clip_image(
    image_path: Path = typer.Argument(..., help="Path to the recipe image file"),
    format: OutputFormat = typer.Option(
        OutputFormat.text,
        "--format",
        "-f",
        help="Output format (text, json, or markdown)",
    ),
    output: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Output file path (prints to stdout if not specified)",
    ),
    api_key: Optional[str] = typer.Option(
        None,
        "--api-key",
        envvar="ANTHROPIC_API_KEY",
        help="Anthropic API key (required, can also use ANTHROPIC_API_KEY env var)",
    ),
    model: str = typer.Option(
        "claude-sonnet-4-5",
        "--model",
        "-m",
        help="Claude model to use",
    ),
):
    """
    Extract a recipe from an image (e.g., cookbook photo, recipe card).

    This command uses Claude's vision API to extract recipe text from images.
    Requires an Anthropic API key.

    Examples:

        recipe-clipper clip-image recipe.jpg

        recipe-clipper clip-image cookbook-page.png --format json --output recipe.json

        recipe-clipper clip-image recipe-card.jpg --api-key sk-ant-... --format markdown
    """
    # Check prerequisites first; both helpers exit with code 1 on failure,
    # so nothing below runs without a key and an existing path.
    api_key = _require_api_key(api_key, "image parsing")
    _validate_file_exists(image_path, "Image")

    # The lambda defers the API call so the shared handler can run it inside
    # its progress spinner and error handling.
    _handle_recipe_extraction(
        extract_func=lambda: parse_recipe_from_image(image_path, api_key, model=model),
        source_description=f"Extracting recipe from {image_path}",
        format=format,
        output=output,
    )
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
@app.command()
def clip_document(
    document_path: Path = typer.Argument(..., help="Path to the recipe document file"),
    format: OutputFormat = typer.Option(
        OutputFormat.text,
        "--format",
        "-f",
        help="Output format (text, json, or markdown)",
    ),
    output: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Output file path (prints to stdout if not specified)",
    ),
    api_key: Optional[str] = typer.Option(
        None,
        "--api-key",
        envvar="ANTHROPIC_API_KEY",
        help="Anthropic API key (required, can also use ANTHROPIC_API_KEY env var)",
    ),
    model: str = typer.Option(
        "claude-sonnet-4-5",
        "--model",
        "-m",
        help="Claude model to use",
    ),
):
    """
    Extract a recipe from a document (PDF, Word, text, markdown).

    This command uses Claude's document API to extract recipe text from various
    document formats. Requires an Anthropic API key.

    Supported formats: .pdf, .docx, .txt, .md

    Examples:

        recipe-clipper clip-document recipe.pdf

        recipe-clipper clip-document cookbook.docx --format json --output recipe.json

        recipe-clipper clip-document recipe.txt --api-key sk-ant-... --format markdown
    """
    # Check prerequisites first; both helpers exit with code 1 on failure,
    # so nothing below runs without a key and an existing path.
    api_key = _require_api_key(api_key, "document parsing")
    _validate_file_exists(document_path, "Document")

    # The lambda defers the API call so the shared handler can run it inside
    # its progress spinner and error handling.
    _handle_recipe_extraction(
        extract_func=lambda: parse_recipe_from_document(document_path, api_key, model=model),
        source_description=f"Extracting recipe from {document_path}",
        format=format,
        output=output,
    )
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
# Allow running this module directly as a script; invokes the Typer app.
if __name__ == "__main__":
    app()
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Main clipper orchestration for extracting recipes from URLs."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from recipe_clipper.models import Recipe
|
|
6
|
+
from recipe_clipper.http import fetch_url
|
|
7
|
+
from recipe_clipper.parsers.recipe_scrapers_parser import parse_with_recipe_scrapers
|
|
8
|
+
from recipe_clipper.exceptions import RecipeParsingError
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def clip_recipe(
    url: str,
    api_key: Optional[str] = None,
    use_llm_fallback: bool = True,
    timeout: int = 10,
) -> Recipe:
    """
    Fetch a recipe page and parse it into a Recipe.

    The page is fetched and handed to the recipe-scrapers parser. When that
    parser raises RecipeParsingError and the LLM fallback is enabled, the URL
    is passed to the Claude-based parser instead.

    Args:
        url: The URL of the recipe page to extract
        api_key: Anthropic API key for LLM fallback
        use_llm_fallback: Whether to use LLM fallback if recipe-scrapers fails (default: True)
        timeout: HTTP request timeout in seconds (default: 10)

    Returns:
        Parsed Recipe object

    Raises:
        ValueError: If use_llm_fallback is True but api_key is not provided
        RecipeNotFoundError: If the recipe cannot be extracted
        NetworkError: If the HTTP request fails
        RecipeParsingError: If parsing fails and the fallback is disabled
        LLMError: If LLM API call fails
    """
    fallback_without_key = use_llm_fallback and not api_key
    if fallback_without_key:
        raise ValueError("api_key must be provided when use_llm_fallback is True")

    try:
        page = fetch_url(url, timeout=timeout)
        return parse_with_recipe_scrapers(page)
    except RecipeParsingError:
        # Without the fallback the original parsing error is what callers see.
        if not use_llm_fallback:
            raise

        # Imported lazily, only when the fallback path is actually taken.
        from recipe_clipper.parsers.llm_parser import parse_with_claude

        return parse_with_claude(url, api_key)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Custom exceptions for recipe clipper."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class RecipeClipperError(Exception):
    """Root of the recipe clipper exception hierarchy.

    Catching this type catches every error class defined by the package.
    """
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class RecipeNotFoundError(RecipeClipperError):
    """Signals that no recipe could be extracted from the URL."""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class RecipeParsingError(RecipeClipperError):
    """Signals that parsing a recipe failed."""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class NetworkError(RecipeClipperError):
    """Signals that an HTTP request failed."""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class LLMError(RecipeClipperError):
    """Signals that a call to the LLM API failed."""
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Recipe output formatters."""
|
|
2
|
+
|
|
3
|
+
from recipe_clipper.models import Recipe, RecipeMetadata, Ingredient
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Helper functions for formatting
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _format_ingredient(ingredient: Ingredient) -> str:
    """Return the ingredient's display text, or its name when that text is empty."""
    shown = ingredient.display_text
    if shown:
        return shown
    return ingredient.name
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _format_metadata_lines(metadata: RecipeMetadata, style: str = "text") -> list[str]:
    """Turn the populated metadata fields into display lines.

    Args:
        metadata: Recipe metadata to format; a falsy value yields no lines
        style: Format style ("text" or "markdown")

    Returns:
        One line per field that has a truthy value, in a fixed field order
    """
    if not metadata:
        return []

    def in_minutes(amount):
        # Falsy durations (None or 0) are left out of the output.
        return f"{amount} minutes" if amount else None

    labelled = (
        ("Author", metadata.author),
        ("Servings", metadata.servings),
        ("Prep Time", in_minutes(metadata.prep_time)),
        ("Cook Time", in_minutes(metadata.cook_time)),
        ("Total Time", in_minutes(metadata.total_time)),
        ("Categories", ", ".join(metadata.categories) if metadata.categories else None),
    )

    template = "- **{}:** {}" if style == "markdown" else "{}: {}"
    return [template.format(label, value) for label, value in labelled if value]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def format_recipe_text(recipe: Recipe) -> str:
    """Render a recipe as an 80-column plain-text report.

    Sections appear in this order: centred title banner, optional metadata,
    ingredients, numbered instructions and, when set, the source URL.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    # Title banner
    out = [f"\n{heavy_rule}", recipe.title.center(80), heavy_rule]

    # Metadata section is omitted entirely when no field is populated
    metadata_lines = _format_metadata_lines(recipe.metadata, style="text")
    if metadata_lines:
        out += ["\nMETADATA", light_rule, *metadata_lines]

    # Ingredients
    out += ["\nINGREDIENTS", light_rule]
    out.extend(f" • {_format_ingredient(item)}" for item in recipe.ingredients)

    # Instructions, numbered from 1
    out += ["\nINSTRUCTIONS", light_rule]
    out.extend(f"{number}. {step}" for number, step in enumerate(recipe.instructions, 1))

    # Source
    if recipe.source_url:
        out.append(f"\nSource: {recipe.source_url}")

    out.append("")
    return "\n".join(out)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def format_recipe_json(recipe: Recipe) -> str:
    """Serialise a recipe to indented JSON, leaving out fields that are None."""
    serialised = recipe.model_dump_json(exclude_none=True, indent=2)
    return serialised
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def format_recipe_markdown(recipe: Recipe) -> str:
    """Render a recipe as a Markdown document.

    Sections appear in this order: title heading, optional metadata list,
    ingredient list, numbered instructions and, when set, a source link.
    Every section is followed by a blank line.
    """
    # Title (top-level heading)
    out = [f"# {recipe.title}", ""]

    # Metadata section is omitted entirely when no field is populated
    metadata_lines = _format_metadata_lines(recipe.metadata, style="markdown")
    if metadata_lines:
        out += ["## Metadata", "", *metadata_lines, ""]

    # Ingredients
    out += ["## Ingredients", ""]
    out.extend(f"- {_format_ingredient(item)}" for item in recipe.ingredients)
    out.append("")

    # Instructions, numbered from 1
    out += ["## Instructions", ""]
    out.extend(f"{number}. {step}" for number, step in enumerate(recipe.instructions, 1))
    out.append("")

    # Source, rendered as a link whose label is the URL itself
    if recipe.source_url:
        out += ["## Source", "", f"[{recipe.source_url}]({recipe.source_url})", ""]

    return "\n".join(out)
|
recipe_clipper/http.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""HTTP client for fetching recipe pages."""
|
|
2
|
+
|
|
3
|
+
import httpx
|
|
4
|
+
from typing import Dict, Optional
|
|
5
|
+
|
|
6
|
+
from recipe_clipper.exceptions import NetworkError
|
|
7
|
+
from recipe_clipper.models import ImmutableBaseModel
|
|
8
|
+
|
|
9
|
+
# User-Agent header value sent when the caller does not supply one.
DEFAULT_USER_AGENT = "recipe-clipper/0.1.0 (https://github.com/recipe-clipper/recipe-clipper)"
# Default request timeout, in seconds.
DEFAULT_TIMEOUT = 10
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class HttpResponse(ImmutableBaseModel):
    """Immutable record of a fetched page, as built by ``fetch_url``."""

    # Response body decoded as text.
    content: str
    # HTTP status code of the response.
    status_code: int
    # URL of the response itself (the final URL when redirects were followed).
    url: str
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def fetch_url(
    url: str,
    timeout: int = DEFAULT_TIMEOUT,
    headers: Optional[Dict[str, str]] = None,
) -> HttpResponse:
    """
    Fetch HTML content from a URL.

    Redirects are followed, and any non-success status is reported as an error.

    Args:
        url: The URL to fetch
        timeout: Request timeout in seconds
        headers: Optional custom headers; the mapping is never modified

    Returns:
        HttpResponse with content, status_code, and final URL (after redirects)

    Raises:
        NetworkError: If the request fails
    """
    # Work on a copy so the caller's dict is not mutated when the default
    # User-Agent is added.
    request_headers = dict(headers) if headers else {}

    # Header names are case-insensitive: respect a caller-supplied User-Agent
    # whatever its casing instead of adding a second one.
    if not any(name.lower() == "user-agent" for name in request_headers):
        request_headers["User-Agent"] = DEFAULT_USER_AGENT

    try:
        with httpx.Client(timeout=timeout) as client:
            response = client.get(url, headers=request_headers, follow_redirects=True)
            response.raise_for_status()
            return HttpResponse(
                content=response.text,
                status_code=response.status_code,
                url=str(response.url),
            )
    except httpx.HTTPStatusError as e:
        raise NetworkError(f"HTTP error {e.response.status_code} while fetching {url}") from e
    except httpx.RequestError as e:
        raise NetworkError(f"Network error while fetching {url}: {e}") from e
    except Exception as e:
        # Boundary catch-all so callers only ever have to handle NetworkError.
        raise NetworkError(f"Unexpected error while fetching {url}: {e}") from e
|
recipe_clipper/models.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Data models for recipe clipper."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
from pydantic import BaseModel, Field, HttpUrl, AnyUrl, ConfigDict
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class ImmutableBaseModel(BaseModel):
    """Base model with immutability enabled."""

    # frozen=True makes pydantic reject attribute assignment on instances.
    model_config = ConfigDict(frozen=True)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Ingredient(ImmutableBaseModel):
    """A single recipe ingredient; only ``name`` is required."""

    name: str = Field(..., description="Ingredient name")
    amount: Optional[str] = Field(default=None, description="Quantity (e.g., '2', '1/2')")
    unit: Optional[str] = Field(
        default=None, description="Unit of measurement (e.g., 'cup', 'tsp')"
    )
    preparation: Optional[str] = Field(
        default=None,
        description="Preparation method (e.g., 'chopped', 'diced', 'minced')",
    )
    display_text: Optional[str] = Field(
        default=None, description="How the ingredient should be displayed"
    )
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class RecipeMetadata(ImmutableBaseModel):
    """Optional descriptive details of a recipe; every field defaults to None."""

    author: Optional[str] = Field(default=None, description="Recipe author or source")
    servings: Optional[str] = Field(default=None, description="Number of servings")
    prep_time: Optional[int] = Field(default=None, description="Prep time in minutes")
    cook_time: Optional[int] = Field(default=None, description="Cook time in minutes")
    total_time: Optional[int] = Field(default=None, description="Total time in minutes")
    categories: Optional[list[str]] = Field(default=None, description="Recipe categories")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class Recipe(ImmutableBaseModel):
    """A full recipe: required title plus ingredients, steps and optional details."""

    title: str = Field(..., description="Recipe title")
    # The two list fields default to a fresh empty list per instance.
    ingredients: list[Ingredient] = Field(
        default_factory=list, description="List of ingredients"
    )
    instructions: list[str] = Field(
        default_factory=list, description="Step-by-step instructions"
    )
    source_url: Optional[AnyUrl] = Field(
        default=None, description="Source URL (http/https/file)"
    )
    image: Optional[HttpUrl] = Field(default=None, description="Recipe image URL")
    metadata: Optional[RecipeMetadata] = Field(default=None, description="Recipe metadata")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Recipe parsers."""
|
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
"""LLM-based recipe parser using Claude API."""
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING, Optional, Union
|
|
6
|
+
|
|
7
|
+
from pydantic import AnyUrl
|
|
8
|
+
|
|
9
|
+
from recipe_clipper.models import Recipe
|
|
10
|
+
from recipe_clipper.exceptions import LLMError
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from anthropic import Anthropic
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Model names accepted by _validate_model(); any other name is rejected
# with a ValueError before an API call is made.
SUPPORTED_MODELS = {
    "claude-sonnet-4-5",
    "claude-sonnet-4",
    "claude-opus-4",
    "claude-3-5-sonnet-20241022",
    "claude-3-5-sonnet-20240620",
}

# Lower-case image file extension -> media type sent with the image data.
IMAGE_MEDIA_TYPES = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
    ".gif": "image/gif",
    ".webp": "image/webp",
}

# Lower-case document file extension -> media type.
# NOTE(review): confirm that the API accepts each of these media types for
# document input; the code that uses this map is not visible here.
DOCUMENT_MEDIA_TYPES = {
    ".pdf": "application/pdf",
    ".txt": "text/plain",
    ".md": "text/markdown",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# Helper functions for common LLM parsing operations
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _validate_model(model: str) -> None:
    """Raise ValueError unless ``model`` is one of SUPPORTED_MODELS."""
    if model in SUPPORTED_MODELS:
        return

    # List the accepted names alphabetically so the message is stable.
    supported = ", ".join(sorted(SUPPORTED_MODELS))
    raise ValueError(f"Unsupported model: {model}. Supported models: {supported}")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _validate_file_path(file_path: Path, file_description: str) -> None:
|
|
52
|
+
"""Validate that a file exists."""
|
|
53
|
+
if not file_path.exists():
|
|
54
|
+
raise FileNotFoundError(f"{file_description} not found: {file_path}")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _read_and_encode_file(file_path: Path) -> str:
|
|
58
|
+
"""Read a file and return base64-encoded data."""
|
|
59
|
+
with open(file_path, "rb") as f:
|
|
60
|
+
return base64.standard_b64encode(f.read()).decode("utf-8")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _get_recipe_extraction_prompt(source_type: str = "document") -> str:
|
|
64
|
+
"""Get the standard recipe extraction prompt.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
source_type: Type of source ("image", "document", or "webpage")
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
Formatted prompt string
|
|
71
|
+
"""
|
|
72
|
+
if source_type == "image":
|
|
73
|
+
visibility_note = (
|
|
74
|
+
"If any information is not visible in the image, omit it from the output.\n"
|
|
75
|
+
"Extract the text exactly as it appears, preserving the original wording and formatting."
|
|
76
|
+
)
|
|
77
|
+
else:
|
|
78
|
+
visibility_note = (
|
|
79
|
+
"If any information is not present in the document, omit it from the output.\n"
|
|
80
|
+
"Extract the text exactly as it appears, preserving the original wording."
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
return f"""Extract the recipe from this {source_type} into a structured output with the following elements:
|
|
84
|
+
- title
|
|
85
|
+
- ingredients
|
|
86
|
+
- amount
|
|
87
|
+
- units
|
|
88
|
+
- preparation method, if available
|
|
89
|
+
- original wording as the display_text
|
|
90
|
+
- instructions
|
|
91
|
+
- metadata
|
|
92
|
+
- author (if visible)
|
|
93
|
+
- number of servings (if visible)
|
|
94
|
+
- prep time (if visible)
|
|
95
|
+
- cook time (if visible)
|
|
96
|
+
- total time (if visible)
|
|
97
|
+
- categories (if visible)
|
|
98
|
+
|
|
99
|
+
{visibility_note}
|
|
100
|
+
"""
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _call_claude_api(
    client: "Anthropic",
    model: str,
    messages: list,
    betas: list[str],
    source_description: str,
    tools: Optional[list] = None,
) -> Recipe:
    """Make a Claude API call with error handling.

    Args:
        client: Anthropic client instance
        model: Model name to use
        messages: Messages to send
        betas: Beta features to enable
        source_description: Description of source for error messages
        tools: Optional tools to provide

    Returns:
        Parsed Recipe object

    Raises:
        LLMError: If the API call fails or the response carries no parsed recipe
    """
    try:
        kwargs = {
            "model": model,
            "max_tokens": 4096,
            "messages": messages,
            "output_format": Recipe,
            "betas": betas,
        }
        # Only send "tools" when some were given.
        if tools:
            kwargs["tools"] = tools

        message = client.beta.messages.parse(**kwargs)
        recipe = message.parsed_output
    except Exception as error:
        raise LLMError(f"Claude API call failed for {source_description}: {error}") from error

    # A response without structured output would otherwise be returned as
    # None and crash callers with an AttributeError; report it as an LLM
    # failure instead. Checked outside the try so it is not re-wrapped.
    if recipe is None:
        raise LLMError(
            f"Claude API call failed for {source_description}: "
            "response contained no structured recipe"
        )
    return recipe
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _set_recipe_source_url(recipe: Recipe, source_path: Union[str, Path]) -> Recipe:
    """Set the source URL for a recipe.

    Args:
        recipe: Recipe object to update
        source_path: Source path (URL string or file path)

    Returns:
        Copy of the recipe with source_url set; http(s) strings are used as
        given, anything else is converted to an absolute ``file://`` URI
    """
    # Require a full scheme prefix: a bare "http" check would treat a local
    # path such as "httpdocs/recipe.pdf" as a web URL.
    is_web_url = isinstance(source_path, str) and source_path.lower().startswith(
        ("http://", "https://")
    )
    if is_web_url:
        source_url = AnyUrl(source_path)
    else:
        source_url = AnyUrl(Path(source_path).absolute().as_uri())
    # The model is frozen, so return an updated copy rather than assigning.
    return recipe.model_copy(update={"source_url": source_url})
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _validate_file_format(
|
|
162
|
+
file_path: Path, media_type_map: dict[str, str], format_category: str
|
|
163
|
+
) -> str:
|
|
164
|
+
"""Validate file format and return media type.
|
|
165
|
+
|
|
166
|
+
Args:
|
|
167
|
+
file_path: Path to the file
|
|
168
|
+
media_type_map: Mapping of file extensions to media types
|
|
169
|
+
format_category: Category description for error message
|
|
170
|
+
|
|
171
|
+
Returns:
|
|
172
|
+
Media type string
|
|
173
|
+
|
|
174
|
+
Raises:
|
|
175
|
+
ValueError: If file format is not supported
|
|
176
|
+
"""
|
|
177
|
+
extension = file_path.suffix.lower()
|
|
178
|
+
|
|
179
|
+
if extension not in media_type_map:
|
|
180
|
+
raise ValueError(
|
|
181
|
+
f"Unsupported {format_category} format: {extension}. "
|
|
182
|
+
f"Supported formats: {', '.join(media_type_map.keys())}"
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
return media_type_map[extension]
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def parse_with_claude(url: str, api_key: str, model: str = "claude-sonnet-4-5") -> Recipe:
    """Extract a recipe from a web page through Claude's structured output API.

    The request carries the extraction prompt, the URL, and a web-fetch tool
    limited to a single use.

    Args:
        url: URL of the recipe page
        api_key: Anthropic API key
        model: Claude model to use (default: claude-sonnet-4-5)

    Returns:
        Parsed Recipe object with source_url set to ``url``

    Raises:
        ValueError: If an unsupported model is specified
        LLMError: If Claude API call fails
    """
    from anthropic import Anthropic

    _validate_model(model)

    client = Anthropic(api_key=api_key)

    instructions = _get_recipe_extraction_prompt("webpage")
    user_message = {"role": "user", "content": f"{instructions}\n\nURL:\n\n{url}"}

    web_fetch_tool = {
        "type": "web_fetch_20250910",
        "name": "web_fetch",
        "max_uses": 1,
    }

    parsed = _call_claude_api(
        client=client,
        model=model,
        messages=[user_message],
        betas=["structured-outputs-2025-11-13", "web-fetch-2025-09-10"],
        source_description=url,
        tools=[web_fetch_tool],
    )

    return _set_recipe_source_url(parsed, url)
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def parse_recipe_from_image(
    image_path: Union[str, Path], api_key: str, model: str = "claude-sonnet-4-5"
) -> Recipe:
    """Extract a recipe from an image file through Claude's vision API.

    The image is sent base64-encoded together with the image extraction
    prompt. Intended for cookbook photos, handwritten recipe cards, or
    screenshots of recipes.

    Args:
        image_path: Path to the image file (jpg, png, gif, webp)
        api_key: Anthropic API key
        model: Claude model to use (default: claude-sonnet-4-5)

    Returns:
        Parsed Recipe object with source_url set to the image's file URI

    Raises:
        ValueError: If an unsupported model is specified or image format is invalid
        FileNotFoundError: If the image file doesn't exist
        LLMError: If Claude API call fails
    """
    from anthropic import Anthropic

    _validate_model(model)

    # All local checks run before the client is created or any data is sent.
    image_file = Path(image_path)
    _validate_file_path(image_file, "Image file")
    media_type = _validate_file_format(image_file, IMAGE_MEDIA_TYPES, "image")
    encoded_image = _read_and_encode_file(image_file)

    client = Anthropic(api_key=api_key)

    image_block = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": media_type,
            "data": encoded_image,
        },
    }
    prompt_block = {"type": "text", "text": _get_recipe_extraction_prompt("image")}

    parsed = _call_claude_api(
        client=client,
        model=model,
        messages=[{"role": "user", "content": [image_block, prompt_block]}],
        betas=["structured-outputs-2025-11-13"],
        source_description=f"image {image_path}",
    )

    return _set_recipe_source_url(parsed, image_file)
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def parse_recipe_from_document(
    document_path: Union[str, Path], api_key: str, model: str = "claude-sonnet-4-5"
) -> Recipe:
    """Extract a structured recipe from a document file using Claude.

    Supports PDF, Word documents (.docx), and text formats (.txt, .md).
    The file is read and encoded by ``_read_and_encode_file`` and sent as a
    ``base64`` document source together with the document extraction prompt.
    The PDF beta flag is added to the request only for ``.pdf`` files.

    Args:
        document_path: Path to the document file (pdf, docx, txt, md)
        api_key: Anthropic API key
        model: Claude model to use (default: claude-sonnet-4-5)

    Returns:
        Parsed Recipe object, passed through ``_set_recipe_source_url`` with
        the document path as its source.

    Raises:
        ValueError: If an unsupported model or document format is specified
        FileNotFoundError: If the document file doesn't exist
        LLMError: If Claude API call fails
    """
    # Imported lazily so the module can be loaded without the optional SDK.
    from anthropic import Anthropic

    _validate_model(model)

    source_file = Path(document_path)
    _validate_file_path(source_file, "Document file")

    mime_type = _validate_file_format(source_file, DOCUMENT_MEDIA_TYPES, "document")
    encoded_document = _read_and_encode_file(source_file)

    anthropic_client = Anthropic(api_key=api_key)

    # One user turn: the document first, then the extraction instructions.
    document_block = {
        "type": "document",
        "source": {
            "type": "base64",
            "media_type": mime_type,
            "data": encoded_document,
        },
    }
    instruction_block = {"type": "text", "text": _get_recipe_extraction_prompt("document")}
    conversation = [{"role": "user", "content": [document_block, instruction_block]}]

    # Structured outputs are always requested; the PDF beta only for .pdf input.
    is_pdf = source_file.suffix.lower() == ".pdf"
    beta_flags = (
        ["structured-outputs-2025-11-13", "pdfs-2024-09-25"]
        if is_pdf
        else ["structured-outputs-2025-11-13"]
    )

    parsed = _call_claude_api(
        client=anthropic_client,
        model=model,
        messages=conversation,
        betas=beta_flags,
        source_description=f"document {document_path}",
    )

    return _set_recipe_source_url(parsed, source_file)
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Parser using the recipe-scrapers library."""
|
|
2
|
+
|
|
3
|
+
from recipe_scrapers import scrape_html
|
|
4
|
+
|
|
5
|
+
from recipe_clipper.models import Recipe, Ingredient, RecipeMetadata
|
|
6
|
+
from recipe_clipper.http import HttpResponse
|
|
7
|
+
from recipe_clipper.exceptions import RecipeParsingError
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def parse_with_recipe_scrapers(response: HttpResponse) -> Recipe:
    """
    Parse a recipe using the recipe-scrapers library.

    The scraper is created with ``supported_only=False`` so that sites without
    a dedicated scraper are still attempted.

    Args:
        response: HttpResponse containing URL and HTML content

    Returns:
        Parsed Recipe object

    Raises:
        RecipeParsingError: If the scraper cannot be created, or if any of the
            scraper's field accessors raises while extracting recipe data.
            Exceptions raised by the model constructors themselves propagate
            unchanged.
    """
    try:
        scraper = scrape_html(response.content, response.url, supported_only=False)
    except Exception as error:
        raise RecipeParsingError(f"Failed to create scraper for {response.url}: {error}") from error

    # Field accessors can raise library-specific exceptions when the page lacks
    # the corresponding data; surface those as the documented domain error
    # instead of leaking them to callers.
    try:
        ingredient_names = scraper.ingredients()
        author = scraper.author()
        servings = scraper.yields()
        prep_time = scraper.prep_time()
        cook_time = scraper.cook_time()
        total_time = scraper.total_time()
        # Read once and reuse, rather than calling the accessor twice.
        category = scraper.category()
        title = scraper.title()
        instructions = scraper.instructions_list()
        image = scraper.image()
    except Exception as error:
        raise RecipeParsingError(
            f"Failed to extract recipe fields from {response.url}: {error}"
        ) from error

    ingredients = [Ingredient(name=ingredient) for ingredient in ingredient_names]

    metadata = RecipeMetadata(
        author=author,
        servings=servings,
        prep_time=prep_time,
        cook_time=cook_time,
        total_time=total_time,
        categories=[category] if category else None,
    )

    return Recipe(
        title=title,
        ingredients=ingredients,
        instructions=instructions,
        source_url=response.url,
        image=image,
        metadata=metadata,
    )
|
|
@@ -0,0 +1,337 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: recipe-clipper
|
|
3
|
+
Version: 0.1.0a0
|
|
4
|
+
Summary: Extract recipes from websites, images, and documents.
|
|
5
|
+
Project-URL: Homepage, https://github.com/zduey/recipe-clipper
|
|
6
|
+
Project-URL: Repository, https://github.com/zduey/recipe-clipper
|
|
7
|
+
Project-URL: Issues, https://github.com/zduey/recipe-clipper/issues
|
|
8
|
+
Author: Recipe Clipper Contributors
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: cooking,food,llm,ocr,parser,pdf,recipe,scraping,vision
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
23
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
24
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
25
|
+
Classifier: Topic :: Utilities
|
|
26
|
+
Requires-Python: >=3.9
|
|
27
|
+
Requires-Dist: httpx>=0.27.0
|
|
28
|
+
Requires-Dist: pydantic>=2.0.0
|
|
29
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
30
|
+
Requires-Dist: recipe-scrapers>=15.0.0
|
|
31
|
+
Requires-Dist: rich>=13.0.0
|
|
32
|
+
Requires-Dist: typer>=0.12.0
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
|
|
35
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
|
|
36
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
37
|
+
Requires-Dist: respx>=0.21.0; extra == 'dev'
|
|
38
|
+
Provides-Extra: llm
|
|
39
|
+
Requires-Dist: anthropic>=0.40.0; extra == 'llm'
|
|
40
|
+
Requires-Dist: cohere>=5.0.0; extra == 'llm'
|
|
41
|
+
Requires-Dist: google-generativeai>=0.3.0; extra == 'llm'
|
|
42
|
+
Requires-Dist: openai>=1.0.0; extra == 'llm'
|
|
43
|
+
Provides-Extra: llm-anthropic
|
|
44
|
+
Requires-Dist: anthropic>=0.40.0; extra == 'llm-anthropic'
|
|
45
|
+
Provides-Extra: llm-cohere
|
|
46
|
+
Requires-Dist: cohere>=5.0.0; extra == 'llm-cohere'
|
|
47
|
+
Provides-Extra: llm-google
|
|
48
|
+
Requires-Dist: google-generativeai>=0.3.0; extra == 'llm-google'
|
|
49
|
+
Provides-Extra: llm-openai
|
|
50
|
+
Requires-Dist: openai>=1.0.0; extra == 'llm-openai'
|
|
51
|
+
Description-Content-Type: text/markdown
|
|
52
|
+
|
|
53
|
+
# Recipe Clipper
|
|
54
|
+
|
|
55
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
56
|
+
[Code coverage](https://codecov.io/gh/zduey/recipe-clipper)
|
|
57
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
58
|
+
[PyPI](https://pypi.org/project/recipe-clipper/)
|
|
59
|
+
|
|
60
|
+
Extract recipes from websites, images, and documents with ease. Recipe Clipper supports multiple input sources and uses both web scraping and Claude's vision capabilities to extract structured recipe data.
|
|
61
|
+
|
|
62
|
+
## Features
|
|
63
|
+
|
|
64
|
+
- 🌐 **Web Scraping**: Extract recipes from 100+ websites using [recipe-scrapers](https://github.com/hhursev/recipe-scrapers)
|
|
65
|
+
- 📸 **Image OCR**: Extract recipes from cookbook photos, recipe cards, or screenshots using Claude's vision API
|
|
66
|
+
- 📄 **Document Parsing**: Extract recipes from PDFs, Word documents, text files, and markdown
|
|
67
|
+
- 🤖 **LLM Fallback**: Automatically falls back to Claude for unsupported websites
|
|
68
|
+
- 🎨 **Multiple Output Formats**: Export as text, JSON, or markdown
|
|
69
|
+
- 🔧 **CLI & Library**: Use as a command-line tool or import as a Python library
|
|
70
|
+
- ⚡ **Type-Safe**: Full type hints with Pydantic models
|
|
71
|
+
- 🔒 **Immutable**: Data models are frozen for safety
|
|
72
|
+
|
|
73
|
+
## Installation
|
|
74
|
+
|
|
75
|
+
### Basic Installation
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
pip install recipe-clipper
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
This includes:
|
|
82
|
+
- Web scraping for 100+ recipe websites
|
|
83
|
+
- CLI tool
|
|
84
|
+
- All core functionality
|
|
85
|
+
|
|
86
|
+
### With Claude Support
|
|
87
|
+
|
|
88
|
+
For image/document parsing and LLM fallback:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
pip install recipe-clipper[llm]
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Quick Start
|
|
95
|
+
|
|
96
|
+
### CLI Usage
|
|
97
|
+
|
|
98
|
+
#### Extract from a website
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
# Basic usage
|
|
102
|
+
recipe-clipper clip-webpage https://www.allrecipes.com/recipe/10813/best-chocolate-chip-cookies/
|
|
103
|
+
|
|
104
|
+
# Save as JSON
|
|
105
|
+
recipe-clipper clip-webpage https://example.com/recipe --format json --output recipe.json
|
|
106
|
+
|
|
107
|
+
# Save as markdown
|
|
108
|
+
recipe-clipper clip-webpage https://example.com/recipe --format markdown --output recipe.md
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
#### Extract from an image
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
# Requires ANTHROPIC_API_KEY environment variable
|
|
115
|
+
export ANTHROPIC_API_KEY=your-api-key
|
|
116
|
+
|
|
117
|
+
recipe-clipper clip-image cookbook-photo.jpg
|
|
118
|
+
|
|
119
|
+
# Save as JSON
|
|
120
|
+
recipe-clipper clip-image recipe-card.png --format json --output recipe.json
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
#### Extract from a document
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
# Supports PDF, DOCX, TXT, MD
|
|
127
|
+
recipe-clipper clip-document recipe.pdf
|
|
128
|
+
|
|
129
|
+
# With custom model
|
|
130
|
+
recipe-clipper clip-document cookbook.docx --model claude-opus-4 --format markdown
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Library Usage
|
|
134
|
+
|
|
135
|
+
#### Extract from a website
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
from recipe_clipper import clip_recipe
|
|
139
|
+
|
|
140
|
+
# Without LLM fallback (uses recipe-scrapers only)
|
|
141
|
+
recipe = clip_recipe(
|
|
142
|
+
url="https://www.allrecipes.com/recipe/10813/best-chocolate-chip-cookies/",
|
|
143
|
+
api_key=None,
|
|
144
|
+
use_llm_fallback=False
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
print(recipe.title)
|
|
148
|
+
for ingredient in recipe.ingredients:
|
|
149
|
+
print(f"- {ingredient.name}")
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
#### With LLM fallback
|
|
153
|
+
|
|
154
|
+
```python
|
|
155
|
+
import os
|
|
156
|
+
from recipe_clipper import clip_recipe
|
|
157
|
+
|
|
158
|
+
api_key = os.getenv("ANTHROPIC_API_KEY")
|
|
159
|
+
|
|
160
|
+
# Automatically falls back to Claude if recipe-scrapers doesn't support the site
|
|
161
|
+
recipe = clip_recipe(
|
|
162
|
+
url="https://unsupported-site.com/recipe",
|
|
163
|
+
api_key=api_key,
|
|
164
|
+
use_llm_fallback=True
|
|
165
|
+
)
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
#### Extract from an image
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
import os
|
|
172
|
+
from recipe_clipper.parsers.llm_parser import parse_recipe_from_image
|
|
173
|
+
|
|
174
|
+
api_key = os.getenv("ANTHROPIC_API_KEY")
|
|
175
|
+
|
|
176
|
+
recipe = parse_recipe_from_image(
|
|
177
|
+
image_path="cookbook-photo.jpg",
|
|
178
|
+
api_key=api_key,
|
|
179
|
+
model="claude-sonnet-4-5"
|
|
180
|
+
)
|
|
181
|
+
|
|
182
|
+
print(recipe.title)
|
|
183
|
+
print(f"Servings: {recipe.metadata.servings}")
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
#### Extract from a document
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
import os
|
|
190
|
+
from recipe_clipper.parsers.llm_parser import parse_recipe_from_document
|
|
191
|
+
|
|
192
|
+
api_key = os.getenv("ANTHROPIC_API_KEY")
|
|
193
|
+
|
|
194
|
+
# Supports .pdf, .docx, .txt, .md
|
|
195
|
+
recipe = parse_recipe_from_document(
|
|
196
|
+
document_path="recipe.pdf",
|
|
197
|
+
api_key=api_key
|
|
198
|
+
)
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
#### Format output
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
from recipe_clipper import clip_recipe
|
|
205
|
+
from recipe_clipper.formatters import (
|
|
206
|
+
format_recipe_text,
|
|
207
|
+
format_recipe_json,
|
|
208
|
+
format_recipe_markdown
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
recipe = clip_recipe("https://example.com/recipe", use_llm_fallback=False)
|
|
212
|
+
|
|
213
|
+
# Plain text
|
|
214
|
+
print(format_recipe_text(recipe))
|
|
215
|
+
|
|
216
|
+
# JSON
|
|
217
|
+
json_str = format_recipe_json(recipe)
|
|
218
|
+
|
|
219
|
+
# Markdown
|
|
220
|
+
markdown_str = format_recipe_markdown(recipe)
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
## Configuration
|
|
224
|
+
|
|
225
|
+
### API Keys
|
|
226
|
+
|
|
227
|
+
For Claude features (image/document parsing, website fallback), set your API key:
|
|
228
|
+
|
|
229
|
+
```bash
|
|
230
|
+
export ANTHROPIC_API_KEY=your-api-key-here
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
Or create a `.env` file:
|
|
234
|
+
|
|
235
|
+
```env
|
|
236
|
+
ANTHROPIC_API_KEY=your-api-key-here
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
### Supported Models
|
|
240
|
+
|
|
241
|
+
- `claude-sonnet-4-5` (default, recommended)
|
|
242
|
+
- `claude-sonnet-4`
|
|
243
|
+
- `claude-opus-4`
|
|
244
|
+
- `claude-3-5-sonnet-20241022`
|
|
245
|
+
- `claude-3-5-sonnet-20240620`
|
|
246
|
+
|
|
247
|
+
## Recipe Data Model
|
|
248
|
+
|
|
249
|
+
Extracted recipes use a structured Pydantic model:
|
|
250
|
+
|
|
251
|
+
```python
|
|
252
|
+
class Recipe:
|
|
253
|
+
title: str
|
|
254
|
+
ingredients: list[Ingredient]
|
|
255
|
+
instructions: list[str]
|
|
256
|
+
source_url: Optional[AnyUrl]
|
|
257
|
+
image: Optional[HttpUrl]
|
|
258
|
+
metadata: Optional[RecipeMetadata]
|
|
259
|
+
|
|
260
|
+
class Ingredient:
|
|
261
|
+
name: str
|
|
262
|
+
amount: Optional[str]
|
|
263
|
+
unit: Optional[str]
|
|
264
|
+
preparation: Optional[str]
|
|
265
|
+
display_text: Optional[str]
|
|
266
|
+
|
|
267
|
+
class RecipeMetadata:
|
|
268
|
+
author: Optional[str]
|
|
269
|
+
servings: Optional[str]
|
|
270
|
+
prep_time: Optional[int] # minutes
|
|
271
|
+
cook_time: Optional[int] # minutes
|
|
272
|
+
total_time: Optional[int] # minutes
|
|
273
|
+
categories: Optional[list[str]]
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
## Supported Input Sources
|
|
277
|
+
|
|
278
|
+
### 1. Websites (100+ sites)
|
|
279
|
+
|
|
280
|
+
Uses [recipe-scrapers](https://github.com/hhursev/recipe-scrapers) which supports:
|
|
281
|
+
- AllRecipes
|
|
282
|
+
- Food Network
|
|
283
|
+
- Serious Eats
|
|
284
|
+
- NYT Cooking
|
|
285
|
+
- And 100+ more sites
|
|
286
|
+
|
|
287
|
+
For unsupported sites, enable LLM fallback.
|
|
288
|
+
|
|
289
|
+
### 2. Images
|
|
290
|
+
|
|
291
|
+
Extracts recipes from:
|
|
292
|
+
- Cookbook photos
|
|
293
|
+
- Handwritten recipe cards
|
|
294
|
+
- Screenshots
|
|
295
|
+
- Scanned documents
|
|
296
|
+
|
|
297
|
+
Supported formats: `.jpg`, `.jpeg`, `.png`, `.gif`, `.webp`
|
|
298
|
+
|
|
299
|
+
### 3. Documents
|
|
300
|
+
|
|
301
|
+
Extracts recipes from:
|
|
302
|
+
- PDFs (recipe PDFs, cookbook PDFs)
|
|
303
|
+
- Word documents (`.docx`)
|
|
304
|
+
- Text files (`.txt`)
|
|
305
|
+
- Markdown files (`.md`)
|
|
306
|
+
|
|
307
|
+
## Development
|
|
308
|
+
|
|
309
|
+
### Run tests
|
|
310
|
+
|
|
311
|
+
```bash
|
|
312
|
+
# Unit tests only
|
|
313
|
+
pytest
|
|
314
|
+
|
|
315
|
+
# Integration tests only (requires ANTHROPIC_API_KEY)
|
|
316
|
+
pytest -m integration
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
### Run linting
|
|
320
|
+
|
|
321
|
+
```bash
|
|
322
|
+
ruff check src/ tests/
|
|
323
|
+
ruff format src/ tests/
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
## Contributing
|
|
327
|
+
|
|
328
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
329
|
+
|
|
330
|
+
## License
|
|
331
|
+
|
|
332
|
+
MIT License - see LICENSE file for details.
|
|
333
|
+
|
|
334
|
+
## Credits
|
|
335
|
+
|
|
336
|
+
- Built with [recipe-scrapers](https://github.com/hhursev/recipe-scrapers)
|
|
337
|
+
- LLM parsing powered by [Anthropic Claude](https://www.anthropic.com/)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
recipe_clipper/__init__.py,sha256=gXkDlSOvb2GvwDjUpQEC1w8jmxpahampu1GCtP39gAY,167
|
|
2
|
+
recipe_clipper/cli.py,sha256=WgwW7CNEU1-h9NerH8i0ORXKXNPFQoPxN72F4u3Vd88,7842
|
|
3
|
+
recipe_clipper/clipper.py,sha256=hvFqXmy0Fb-PcuWx0p1pHmRiyrYEspniW76JFP_x2OY,1690
|
|
4
|
+
recipe_clipper/exceptions.py,sha256=gUPxVA0SRLOgH65EEcYU3ZfL61HpwF3xGzbIzdkGPlU,557
|
|
5
|
+
recipe_clipper/formatters.py,sha256=BzJS0r4PLgczv9NgRgOESIlvJyzpJzm3CPt18gMr_GQ,3488
|
|
6
|
+
recipe_clipper/http.py,sha256=p27Y0RgQSJhdtujA5Jk-VrSloJSQ5M4synQftk-ZjwM,1790
|
|
7
|
+
recipe_clipper/models.py,sha256=0loKlbgNHfJ-rHiu8FYwOEcS3yxHUr-UapAMegf2eNs,1935
|
|
8
|
+
recipe_clipper/parsers/__init__.py,sha256=0Cj9Udq6SJKyYHUfEYXlQCFc2hlfpfw8EonhRUIb2Zc,22
|
|
9
|
+
recipe_clipper/parsers/llm_parser.py,sha256=CEOZxYldyGv6aO0J1FzJVI6b9Jo0vTVSVEcwx8FSNxg,10087
|
|
10
|
+
recipe_clipper/parsers/recipe_scrapers_parser.py,sha256=lnmogMzXCBOnG5XrDxB2m9pkWTDUHY1nUHlWCagA0ac,1640
|
|
11
|
+
recipe_clipper-0.1.0a0.dist-info/METADATA,sha256=P5LRPu_wsVbKm00LyW7r2SXXfupFlRLrgPH-oDs4FaQ,8805
|
|
12
|
+
recipe_clipper-0.1.0a0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
13
|
+
recipe_clipper-0.1.0a0.dist-info/entry_points.txt,sha256=KsfF-9OWxxR7bhlrWBBOs-MM-yyNZV3bQ7XPHw6DOTM,58
|
|
14
|
+
recipe_clipper-0.1.0a0.dist-info/licenses/LICENSE,sha256=ftq44Im9g7t_9sVoIBKknL-DdtaTskhAedC2zOQOp7o,1084
|
|
15
|
+
recipe_clipper-0.1.0a0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Recipe Clipper Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|