markitai 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. markitai/__init__.py +3 -0
  2. markitai/batch.py +1316 -0
  3. markitai/cli.py +3979 -0
  4. markitai/config.py +602 -0
  5. markitai/config.schema.json +748 -0
  6. markitai/constants.py +222 -0
  7. markitai/converter/__init__.py +49 -0
  8. markitai/converter/_patches.py +98 -0
  9. markitai/converter/base.py +164 -0
  10. markitai/converter/image.py +181 -0
  11. markitai/converter/legacy.py +606 -0
  12. markitai/converter/office.py +526 -0
  13. markitai/converter/pdf.py +679 -0
  14. markitai/converter/text.py +63 -0
  15. markitai/fetch.py +1725 -0
  16. markitai/image.py +1335 -0
  17. markitai/json_order.py +550 -0
  18. markitai/llm.py +4339 -0
  19. markitai/ocr.py +347 -0
  20. markitai/prompts/__init__.py +159 -0
  21. markitai/prompts/cleaner.md +93 -0
  22. markitai/prompts/document_enhance.md +77 -0
  23. markitai/prompts/document_enhance_complete.md +65 -0
  24. markitai/prompts/document_process.md +60 -0
  25. markitai/prompts/frontmatter.md +28 -0
  26. markitai/prompts/image_analysis.md +21 -0
  27. markitai/prompts/image_caption.md +8 -0
  28. markitai/prompts/image_description.md +13 -0
  29. markitai/prompts/page_content.md +17 -0
  30. markitai/prompts/url_enhance.md +78 -0
  31. markitai/security.py +286 -0
  32. markitai/types.py +30 -0
  33. markitai/urls.py +187 -0
  34. markitai/utils/__init__.py +33 -0
  35. markitai/utils/executor.py +69 -0
  36. markitai/utils/mime.py +85 -0
  37. markitai/utils/office.py +262 -0
  38. markitai/utils/output.py +53 -0
  39. markitai/utils/paths.py +81 -0
  40. markitai/utils/text.py +359 -0
  41. markitai/workflow/__init__.py +37 -0
  42. markitai/workflow/core.py +760 -0
  43. markitai/workflow/helpers.py +509 -0
  44. markitai/workflow/single.py +369 -0
  45. markitai-0.3.0.dist-info/METADATA +159 -0
  46. markitai-0.3.0.dist-info/RECORD +48 -0
  47. markitai-0.3.0.dist-info/WHEEL +4 -0
  48. markitai-0.3.0.dist-info/entry_points.txt +2 -0
markitai/workflow/helpers.py
@@ -0,0 +1,509 @@
+ """Helper utilities for workflow processing."""
+
+ from __future__ import annotations
+
+ import json
+ import re
+ from datetime import datetime
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any
+
+ import yaml
+ from loguru import logger
+
+ from markitai.json_order import order_images
+ from markitai.security import atomic_write_json
+ from markitai.utils.paths import ensure_dir
+
+ if TYPE_CHECKING:
+     from markitai.config import MarkitaiConfig
+     from markitai.llm import ImageAnalysis, LLMProcessor, LLMRuntime
+     from markitai.workflow.single import ImageAnalysisResult
+
+ # Canonical frontmatter field order
+ FRONTMATTER_FIELD_ORDER = [
+     "title",
+     "source",
+     "description",
+     "tags",
+     "markitai_processed",
+ ]
+
+
+ def normalize_frontmatter(frontmatter: str | dict[str, Any]) -> str:
+     """Normalize frontmatter to ensure consistent field order.
+
+     Parses the frontmatter (if string), reorders fields according to
+     FRONTMATTER_FIELD_ORDER, and outputs clean YAML without markers.
+
+     Args:
+         frontmatter: YAML string (with or without --- markers) or dict
+
+     Returns:
+         Normalized YAML string without --- markers
+     """
+     if isinstance(frontmatter, str):
+         # Remove --- markers and code block markers
+         cleaned = frontmatter.strip()
+         # Remove ```yaml ... ``` wrapper
+         code_block_pattern = r"^```(?:ya?ml)?\s*\n?(.*?)\n?```$"
+         match = re.match(code_block_pattern, cleaned, re.DOTALL | re.IGNORECASE)
+         if match:
+             cleaned = match.group(1).strip()
+         # Remove --- markers
+         if cleaned.startswith("---"):
+             cleaned = cleaned[3:].strip()
+         if cleaned.endswith("---"):
+             cleaned = cleaned[:-3].strip()
+
+         try:
+             data = yaml.safe_load(cleaned) or {}
+         except yaml.YAMLError:
+             # If parsing fails, return as-is
+             return cleaned
+     else:
+         data = frontmatter
+
+     if not isinstance(data, dict):
+         return str(data)
+
+     # Build ordered output
+     ordered_lines = []
+
+     def format_field(field: str, value: Any) -> str:
+         """Format a single field as valid YAML."""
+         # Use yaml.dump for proper escaping of special characters
+         # default_flow_style=False ensures block style (key: value, not {key: value})
+         formatted = yaml.dump(
+             {field: value},
+             allow_unicode=True,
+             default_flow_style=False,
+             width=1000,  # Prevent line wrapping
+         ).strip()
+         return formatted
+
+     # First, add fields in canonical order
+     for field in FRONTMATTER_FIELD_ORDER:
+         if field in data:
+             ordered_lines.append(format_field(field, data[field]))
+
+     # Then, add any remaining fields not in the canonical order
+     for field, value in data.items():
+         if field not in FRONTMATTER_FIELD_ORDER:
+             ordered_lines.append(format_field(field, value))
+
+     return "\n".join(ordered_lines)
+
+
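Not part of the wheel contents — a minimal sketch of the reordering this helper performs, assuming the module is imported from `markitai.workflow.helpers` as packaged above:

```python
from markitai.workflow.helpers import normalize_frontmatter

raw = """---
tags:
- demo
title: Example Doc
source: example.docx
---"""

# Fields are re-emitted in FRONTMATTER_FIELD_ORDER; the --- markers are stripped.
print(normalize_frontmatter(raw))
# title: Example Doc
# source: example.docx
# tags:
# - demo
```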
+ # Language code to full name mapping for LLM prompts
+ LANGUAGE_NAMES: dict[str, str] = {
+     "zh": "Chinese",
+     "en": "English",
+ }
+
+
+ def detect_language(content: str) -> str:
+     """Detect the primary language of the content.
+
+     Uses a simple heuristic: if more than 10% of characters are CJK,
+     consider it Chinese.
+
+     Args:
+         content: Text content to analyze
+
+     Returns:
+         Language code: "zh" for Chinese, "en" for English/other
+     """
+     if not content:
+         return "en"
+
+     # Count CJK characters (Chinese, Japanese, Korean)
+     cjk_count = 0
+     total_count = 0
+
+     for char in content:
+         if char.isalpha():
+             total_count += 1
+             # CJK Unified Ideographs range
+             if "\u4e00" <= char <= "\u9fff":
+                 cjk_count += 1
+
+     if total_count == 0:
+         return "en"
+
+     # If more than 10% CJK characters, consider it Chinese
+     if cjk_count / total_count > 0.1:
+         return "zh"
+
+     return "en"
+
+
+ def get_language_name(language_code: str) -> str:
+     """Get full language name from language code.
+
+     Args:
+         language_code: Language code ("zh" or "en")
+
+     Returns:
+         Full language name ("Chinese" or "English")
+     """
+     return LANGUAGE_NAMES.get(language_code, "English")
+
+
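Again as an illustrative sketch (not from the package), the CJK heuristic behaves roughly like this:

```python
from markitai.workflow.helpers import detect_language, get_language_name

detect_language("Hello, world")       # "en" -- no CJK characters
detect_language("这是一段中文内容")     # "zh" -- every alphabetic character is CJK
detect_language("Markdown 转换工具")   # "zh" -- CJK share exceeds the 10% threshold
get_language_name("zh")               # "Chinese"
```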
+ def add_basic_frontmatter(
+     content: str,
+     source: str,
+     fetch_strategy: str | None = None,
+     screenshot_path: Path | None = None,
+     output_dir: Path | None = None,
+     dedupe: bool = True,
+ ) -> str:
+     """Add basic frontmatter (title, source, markitai_processed) to markdown content.
+
+     Used for .md files that don't go through full LLM processing.
+
+     Args:
+         content: Markdown content
+         source: Source file name or URL
+         fetch_strategy: Optional fetch strategy used (e.g., "static", "browser")
+         screenshot_path: Optional path to page screenshot
+         output_dir: Optional output directory (for relative screenshot path)
+         dedupe: Whether to deduplicate paragraphs (default True)
+
+     Returns:
+         Content with basic frontmatter prepended
+     """
+     from markitai.utils.text import dedupe_long_text_blocks, dedupe_paragraphs
+
+     # Apply deduplication for browser-fetched content
+     if dedupe:
+         # First pass: paragraph-level deduplication (for standard markdown)
+         content = dedupe_paragraphs(content)
+         # Second pass: long text block deduplication (for social media content)
+         content = dedupe_long_text_blocks(content)
+
+     # Extract title from first heading or use source name
+     title = source
+     lines = content.strip().split("\n")
+     for line in lines:
+         if line.startswith("#"):
+             # Remove # and ** markers, strip whitespace
+             title = line.lstrip("#").strip()
+             title = title.replace("**", "").strip()
+             if title:
+                 break
+
+     timestamp = datetime.now().astimezone().isoformat()
+
+     frontmatter_dict: dict[str, Any] = {
+         "title": title,
+         "source": source,
+         "markitai_processed": timestamp,
+     }
+
+     # Add fetch_strategy if provided
+     if fetch_strategy:
+         frontmatter_dict["fetch_strategy"] = fetch_strategy
+
+     frontmatter_yaml = normalize_frontmatter(frontmatter_dict)
+
+     result = f"---\n{frontmatter_yaml}\n---\n\n{content}"
+
+     # Add screenshot reference as HTML comment at the end
+     if screenshot_path and screenshot_path.exists():
+         if output_dir:
+             # Calculate relative path from output file to screenshot
+             try:
+                 rel_path = screenshot_path.relative_to(output_dir)
+             except ValueError:
+                 rel_path = screenshot_path
+         else:
+             rel_path = screenshot_path.name
+
+         # Add screenshot reference at the end
+         result = (
+             result.rstrip()
+             + f"\n\n<!-- Screenshot for reference -->\n<!-- ![Screenshot]({rel_path}) -->\n"
+         )
+
+     return result
+
+
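A hypothetical call (not part of the package source); the timestamp shown is a placeholder for whatever `datetime.now()` returns at run time:

```python
from markitai.workflow.helpers import add_basic_frontmatter

md = add_basic_frontmatter(
    "# Quarterly Report\n\nRevenue grew 12%.",
    source="report.docx",
    dedupe=False,  # skip the dedupe passes for this small snippet
)
print(md)
# ---
# title: Quarterly Report
# source: report.docx
# markitai_processed: '2025-01-01T12:00:00+08:00'
# ---
#
# # Quarterly Report
#
# Revenue grew 12%.
```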
+ def merge_llm_usage(
+     target: dict[str, dict[str, Any]],
+     source: dict[str, dict[str, Any]],
+ ) -> None:
+     """Merge LLM usage statistics from source into target.
+
+     Args:
+         target: Target dict to merge into (modified in place)
+         source: Source dict to merge from
+     """
+     for model, usage in source.items():
+         if model not in target:
+             target[model] = {
+                 "requests": 0,
+                 "input_tokens": 0,
+                 "output_tokens": 0,
+                 "cost_usd": 0.0,
+             }
+         # Use .get() for robustness in case target has incomplete fields
+         target[model]["requests"] = target[model].get("requests", 0) + usage.get(
+             "requests", 0
+         )
+         target[model]["input_tokens"] = target[model].get(
+             "input_tokens", 0
+         ) + usage.get("input_tokens", 0)
+         target[model]["output_tokens"] = target[model].get(
+             "output_tokens", 0
+         ) + usage.get("output_tokens", 0)
+         target[model]["cost_usd"] = target[model].get("cost_usd", 0.0) + usage.get(
+             "cost_usd", 0.0
+         )
+
+
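For illustration only (the model names and numbers are made up), merging accumulates per-model counters in place:

```python
from markitai.workflow.helpers import merge_llm_usage

totals = {"model-a": {"requests": 2, "input_tokens": 900, "output_tokens": 300, "cost_usd": 0.5}}
merge_llm_usage(
    totals,
    {
        "model-a": {"requests": 1, "input_tokens": 500, "output_tokens": 200, "cost_usd": 0.25},
        "model-b": {"requests": 1, "input_tokens": 100, "output_tokens": 50, "cost_usd": 0.05},
    },
)
# totals["model-a"] -> {"requests": 3, "input_tokens": 1400, "output_tokens": 500, "cost_usd": 0.75}
# totals["model-b"] -> {"requests": 1, "input_tokens": 100, "output_tokens": 50, "cost_usd": 0.05}
```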
+ class LLMUsageAccumulator:
+     """Accumulator for LLM usage statistics and cost.
+
+     This class centralizes the common pattern of tracking LLM cost and usage,
+     reducing the boilerplate of `llm_cost += cost` and `merge_llm_usage(usage, new_usage)`.
+
+     Example:
+         >>> acc = LLMUsageAccumulator()
+         >>> # After each LLM call
+         >>> acc.add(cost=0.05, usage={"gpt-4": {"requests": 1, ...}})
+         >>> acc.add(cost=0.03, usage={"gpt-4": {"requests": 1, ...}})
+         >>> # Get totals
+         >>> print(f"Total cost: ${acc.total_cost:.4f}")
+         >>> print(f"Usage by model: {acc.usage}")
+     """
+
+     def __init__(self) -> None:
+         """Initialize accumulator with zero cost and empty usage."""
+         self.total_cost: float = 0.0
+         self.usage: dict[str, dict[str, Any]] = {}
+
+     def add(
+         self, cost: float = 0.0, usage: dict[str, dict[str, Any]] | None = None
+     ) -> None:
+         """Add cost and usage statistics.
+
+         Args:
+             cost: Cost in USD to add
+             usage: Usage statistics by model to merge
+         """
+         self.total_cost += cost
+         if usage:
+             merge_llm_usage(self.usage, usage)
+
+     def reset(self) -> None:
+         """Reset accumulator to initial state."""
+         self.total_cost = 0.0
+         self.usage = {}
+
+
+ def write_images_json(
+     output_dir: Path,
+     analysis_results: list[ImageAnalysisResult],
+ ) -> list[Path]:
+     """Write or merge image descriptions to JSON files in each assets directory.
+
+     Each assets directory (e.g., output/assets/, output/sub_dir/assets/) gets
+     its own images.json file containing only the images from that directory.
+
+     Args:
+         output_dir: Output directory
+         analysis_results: List of ImageAnalysisResult objects
+
+     Returns:
+         List of paths to created/updated JSON files
+     """
+     if not analysis_results:
+         return []
+
+     # Group images by their containing assets directory
+     # Key: assets_dir path, Value: list of (source_file, image_dict) tuples
+     images_by_dir: dict[Path, list[tuple[str, dict[str, Any]]]] = {}
+
+     for result in analysis_results:
+         if not result.assets:
+             continue
+
+         for asset in result.assets:
+             # Determine assets directory from the image path
+             # Note: asset dict uses "asset" key internally, will be renamed to "path" in output
+             image_path = Path(asset.get("asset", ""))
+             if image_path.parent.name == "assets":
+                 assets_dir = image_path.parent
+             else:
+                 # Fallback to default assets directory
+                 assets_dir = output_dir / "assets"
+
+             if assets_dir not in images_by_dir:
+                 images_by_dir[assets_dir] = []
+             images_by_dir[assets_dir].append((result.source_file, asset))
+
+     # Write an images.json file for each assets directory
+     created_files: list[Path] = []
+     local_now = datetime.now().astimezone().isoformat()
+
+     for assets_dir, image_entries in images_by_dir.items():
+         # Check for both old (assets.json) and new (images.json) filenames
+         json_file = assets_dir / "images.json"
+         old_json_file = assets_dir / "assets.json"
+
+         # Load existing data if file exists (prefer new name, fallback to old)
+         existing_data: dict[str, Any] = {}
+         if json_file.exists():
+             try:
+                 existing_data = json.loads(json_file.read_text(encoding="utf-8"))
+             except (json.JSONDecodeError, OSError):
+                 existing_data = {}
+         elif old_json_file.exists():
+             try:
+                 existing_data = json.loads(old_json_file.read_text(encoding="utf-8"))
+             except (json.JSONDecodeError, OSError):
+                 existing_data = {}
+
+         # Build images map keyed by path (merge with existing)
+         # Support both old (assets/asset) and new (images/path) field names
+         images_map: dict[str, dict[str, Any]] = {}
+         existing_images = existing_data.get("images") or existing_data.get("assets", [])
+         for existing_image in existing_images:
+             if isinstance(existing_image, dict):
+                 # Get path from either "path" (new) or "asset" (old)
+                 img_path = existing_image.get("path") or existing_image.get("asset", "")
+                 if img_path:
+                     images_map[img_path] = existing_image
+
+         # Add/update images from this batch
+         for source_file, asset in image_entries:
+             # Convert internal "asset" key to "path" for output
+             # Filter out llm_usage (internal tracking, not needed in output)
+             image_entry = {k: v for k, v in asset.items() if k != "llm_usage"}
+             image_entry["source"] = source_file
+             if "asset" in image_entry:
+                 image_entry["path"] = image_entry.pop("asset")
+             images_map[image_entry.get("path", "")] = image_entry
+
+         # Build final JSON structure
+         images_json = {
+             "version": "1.0",
+             "created": existing_data.get("created", local_now),
+             "updated": local_now,
+             "images": list(images_map.values()),
+         }
+
+         ensure_dir(assets_dir)
+         atomic_write_json(json_file, images_json, order_func=order_images)
+         created_files.append(json_file)
+
+     # Log summary of created files
+     if created_files:
+         if len(created_files) == 1:
+             logger.info(f"Image descriptions saved: {created_files[0]}")
+         else:
+             logger.info(f"Asset descriptions saved: {len(created_files)} files")
+
+     return created_files
+
+
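Not from the package — an approximate sketch of the images.json written per assets directory. Per-image fields other than `path` and `source` depend on the asset dicts produced upstream and are assumptions here:

```python
# Shape of assets/images.json as assembled by write_images_json (illustrative values):
{
    "version": "1.0",
    "created": "2025-01-01T12:00:00+08:00",  # preserved from an existing file if present
    "updated": "2025-01-01T12:05:00+08:00",
    "images": [
        {"path": "assets/figure-01.png", "source": "report.docx"},
    ],
}
```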
+ def format_standalone_image_markdown(
+     input_path: Path,
+     analysis: ImageAnalysis,
+     image_ref_path: str,
+     include_frontmatter: bool = False,
+ ) -> str:
+     """Format analysis results for a standalone image file.
+
+     Creates a rich markdown document with:
+     - Optional frontmatter (for .llm.md files)
+     - Title (image filename)
+     - Image preview
+     - Image description section
+     - Extracted text section (if any text was found)
+
+     Args:
+         input_path: Original image file path
+         analysis: ImageAnalysis result with caption, description, extracted_text
+         image_ref_path: Relative path for image reference
+         include_frontmatter: Whether to include YAML frontmatter
+
+     Returns:
+         Formatted markdown string
+     """
+     sections = []
+
+     # Frontmatter (for .llm.md files)
+     if include_frontmatter:
+         timestamp = datetime.now().astimezone().isoformat()
+         frontmatter_lines = [
+             "---",
+             f"title: {input_path.stem}",
+             f"description: {analysis.caption}",
+             f"source: {input_path.name}",
+             "tags:",
+             "- image",
+             "- analysis",
+             f"markitai_processed: {timestamp}",
+             "---",
+             "",
+         ]
+         sections.append("\n".join(frontmatter_lines))
+
+     # Title
+     sections.append(f"# {input_path.stem}\n")
+
+     # Image preview with alt text
+     sections.append(f"![{analysis.caption}]({image_ref_path})\n")
+
+     # Image description section
+     if analysis.description:
+         desc = analysis.description.strip()
+         # Only add section header if description doesn't already start with a header
+         if not desc.startswith("#"):
+             sections.append("## Image Description\n")
+         sections.append(f"{desc}\n")
+
+     # Extracted text section (only if text was found)
+     if analysis.extracted_text and analysis.extracted_text.strip():
+         sections.append("## Extracted Text\n")
+         sections.append(f"```\n{analysis.extracted_text}\n```\n")
+
+     return "\n".join(sections)
+
+
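Roughly, for a standalone image without frontmatter the returned markdown is shaped like the sketch below; the caption, description, and extracted text come from the `ImageAnalysis` result, and the call itself is hypothetical:

```python
# format_standalone_image_markdown(Path("chart.png"), analysis, "assets/chart.png")
# returns markdown along these lines:
#
#   # chart
#
#   ![<caption>](assets/chart.png)
#
#   ## Image Description
#
#   <description>
#
#   ## Extracted Text
#
#   (the extracted text, wrapped in a fenced code block)
```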
+ def create_llm_processor(
+     config: MarkitaiConfig,
+     project_dir: Path | None = None,
+     runtime: LLMRuntime | None = None,
+ ) -> LLMProcessor:
+     """Create an LLMProcessor instance from configuration.
+
+     This is a factory function to centralize LLMProcessor instantiation,
+     reducing code duplication across CLI and workflow modules.
+
+     Args:
+         config: Markitai configuration object
+         project_dir: Optional project directory for project-level cache.
+             If None, only global cache is used.
+         runtime: Optional shared runtime for concurrency control.
+             If provided, uses runtime's semaphore instead of creating one.
+
+     Returns:
+         Configured LLMProcessor instance
+
+     Example:
+         >>> processor = create_llm_processor(cfg, project_dir=output_dir.parent)
+         >>> result = await processor.process_document(content)
+     """
+     from markitai.llm import LLMProcessor
+
+     return LLMProcessor(
+         config.llm,
+         config.prompts,
+         runtime=runtime,
+         project_dir=project_dir,
+         no_cache=config.cache.no_cache,
+         no_cache_patterns=config.cache.no_cache_patterns,
+     )