markitai 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. markitai/__init__.py +3 -0
  2. markitai/batch.py +1316 -0
  3. markitai/cli.py +3979 -0
  4. markitai/config.py +602 -0
  5. markitai/config.schema.json +748 -0
  6. markitai/constants.py +222 -0
  7. markitai/converter/__init__.py +49 -0
  8. markitai/converter/_patches.py +98 -0
  9. markitai/converter/base.py +164 -0
  10. markitai/converter/image.py +181 -0
  11. markitai/converter/legacy.py +606 -0
  12. markitai/converter/office.py +526 -0
  13. markitai/converter/pdf.py +679 -0
  14. markitai/converter/text.py +63 -0
  15. markitai/fetch.py +1725 -0
  16. markitai/image.py +1335 -0
  17. markitai/json_order.py +550 -0
  18. markitai/llm.py +4339 -0
  19. markitai/ocr.py +347 -0
  20. markitai/prompts/__init__.py +159 -0
  21. markitai/prompts/cleaner.md +93 -0
  22. markitai/prompts/document_enhance.md +77 -0
  23. markitai/prompts/document_enhance_complete.md +65 -0
  24. markitai/prompts/document_process.md +60 -0
  25. markitai/prompts/frontmatter.md +28 -0
  26. markitai/prompts/image_analysis.md +21 -0
  27. markitai/prompts/image_caption.md +8 -0
  28. markitai/prompts/image_description.md +13 -0
  29. markitai/prompts/page_content.md +17 -0
  30. markitai/prompts/url_enhance.md +78 -0
  31. markitai/security.py +286 -0
  32. markitai/types.py +30 -0
  33. markitai/urls.py +187 -0
  34. markitai/utils/__init__.py +33 -0
  35. markitai/utils/executor.py +69 -0
  36. markitai/utils/mime.py +85 -0
  37. markitai/utils/office.py +262 -0
  38. markitai/utils/output.py +53 -0
  39. markitai/utils/paths.py +81 -0
  40. markitai/utils/text.py +359 -0
  41. markitai/workflow/__init__.py +37 -0
  42. markitai/workflow/core.py +760 -0
  43. markitai/workflow/helpers.py +509 -0
  44. markitai/workflow/single.py +369 -0
  45. markitai-0.3.0.dist-info/METADATA +159 -0
  46. markitai-0.3.0.dist-info/RECORD +48 -0
  47. markitai-0.3.0.dist-info/WHEEL +4 -0
  48. markitai-0.3.0.dist-info/entry_points.txt +2 -0
markitai/json_order.py ADDED
@@ -0,0 +1,550 @@
1
+ """JSON field ordering definitions and utilities.
2
+
3
+ This module provides standardized field ordering for JSON output files
4
+ (report.json, state.json, images.json) to ensure consistent, readable output.
5
+
6
+ It also handles:
7
+ - Duration formatting (seconds -> human-readable)
8
+ - Cache details merging (fetch_cache_hit + llm_cache_hit -> cache_details)
9
+ - URL hierarchy transformation (flat urls -> grouped url_sources)
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from typing import Any
15
+
16
+ # =============================================================================
17
+ # Field Order Definitions
18
+ # =============================================================================
19
+
20
# Each list below names the keys that order_dict() emits first, in this exact
# sequence. Keys that are present in the data but not listed are kept and
# appended afterwards in their original order.

# report.json top-level fields (applied by order_report)
REPORT_FIELD_ORDER = [
    "version",
    "generated_at",
    "started_at",
    "updated_at",
    "log_file",
    "options",
    "summary",
    "llm_usage",
    "documents",
    "url_sources",
]

# state.json top-level fields (minimal for resume; applied by order_state)
STATE_FIELD_ORDER = [
    "version",
    "options",
    "documents",
    "urls",
]

# images.json top-level fields (formerly assets.json; applied by order_images)
IMAGES_FIELD_ORDER = [
    "version",
    "created",
    "updated",
    "images",
]

# "options" sub-dict fields (shared by order_report and order_state)
OPTIONS_FIELD_ORDER = [
    "concurrency",
    "llm",
    "cache",
    "ocr",
    "screenshot",
    "alt",
    "desc",
    "fetch_strategy",
    "models",
    "input_dir",
    "output_dir",
]

# "summary" sub-dict fields (applied by _transform_summary, which also turns
# numeric "duration" / "processing_time" values into formatted strings)
SUMMARY_FIELD_ORDER = [
    "total_documents",
    "completed_documents",
    "failed_documents",
    "pending_documents",
    "total_urls",
    "completed_urls",
    "failed_urls",
    "pending_urls",
    "url_cache_hits",
    "url_sources",
    "duration",
    "processing_time",
]

# Top-level "llm_usage" fields (applied by _order_llm_usage)
LLM_USAGE_FIELD_ORDER = [
    "models",
    "requests",
    "input_tokens",
    "output_tokens",
    "cost_usd",
]

# llm_usage.models.{model} fields; also applied to the per-model usage dicts
# nested inside document, URL and image entries
LLM_MODEL_USAGE_FIELD_ORDER = [
    "requests",
    "input_tokens",
    "output_tokens",
    "cost_usd",
]

# documents.{path} fields (document entry)
FILE_ENTRY_FIELD_ORDER = [
    "status",
    "cache_hit",
    "output",
    "error",
    "started_at",
    "completed_at",
    "duration",
    "images",
    "screenshots",
    "cost_usd",
    "llm_usage",
]

# url_sources.{file}.urls.{url} fields (URL entry); same as the document entry
# order plus "cache_details" and "fetch_strategy"
URL_ENTRY_FIELD_ORDER = [
    "status",
    "cache_hit",
    "cache_details",
    "output",
    "error",
    "fetch_strategy",
    "started_at",
    "completed_at",
    "duration",
    "images",
    "screenshots",
    "cost_usd",
    "llm_usage",
]

# url_sources.{file} fields (URL source file entry: counters, then the URLs)
URL_FILE_ENTRY_FIELD_ORDER = [
    "total",
    "completed",
    "failed",
    "urls",
]

# cache_details fields (built by _transform_url_entry from the separate
# fetch_cache_hit / llm_cache_hit flags)
CACHE_DETAILS_FIELD_ORDER = [
    "fetch",
    "llm",
]

# images[].{item} fields (formerly assets[])
# Note: llm_usage is intentionally excluded (internal tracking only). Because
# order_dict() keeps unlisted keys, an llm_usage key that is present is still
# emitted, after the fields listed here.
IMAGE_ENTRY_FIELD_ORDER = [
    "path",
    "alt",
    "desc",
    "text",
    "created",
    "source",
]
154
+
155
+
156
+ # =============================================================================
157
+ # Helper Functions
158
+ # =============================================================================
159
+
160
+
161
+ def _format_duration(seconds: float | None) -> str:
162
+ """Format duration in seconds to human-readable string.
163
+
164
+ Args:
165
+ seconds: Duration in seconds
166
+
167
+ Returns:
168
+ Formatted string like "00:03:06" or "32.5s" for short durations
169
+ """
170
+ if seconds is None:
171
+ return "0s"
172
+
173
+ if seconds < 60:
174
+ return f"{seconds:.1f}s"
175
+
176
+ hours, remainder = divmod(int(seconds), 3600)
177
+ minutes, secs = divmod(remainder, 60)
178
+
179
+ if hours > 0:
180
+ return f"{hours:02d}:{minutes:02d}:{secs:02d}"
181
+ return f"{minutes:02d}:{secs:02d}"
182
+
183
+
184
+ # =============================================================================
185
+ # Ordering Functions
186
+ # =============================================================================
187
+
188
+
189
def order_dict(d: dict[str, Any], field_order: list[str]) -> dict[str, Any]:
    """Return a copy of ``d`` whose keys follow ``field_order``.

    Keys named in ``field_order`` are emitted first, in that sequence (names
    that are absent from ``d`` are skipped). Every other key of ``d`` follows,
    keeping the relative order it had in ``d``.

    Args:
        d: Dictionary to reorder
        field_order: List of field names in desired order

    Returns:
        New dict with reordered keys; a non-dict ``d`` is returned unchanged
    """
    if isinstance(d, dict):
        # Keys with a prescribed position.
        leading = {name: d[name] for name in field_order if name in d}
        # Everything else, in original insertion order.
        trailing = {name: value for name, value in d.items() if name not in leading}
        return {**leading, **trailing}

    return d
218
+
219
+
220
def order_dict_keys_sorted(d: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of ``d`` with its keys in ascending sorted order.

    Args:
        d: Dictionary to reorder

    Returns:
        New dict with sorted keys; a non-dict ``d`` is returned unchanged
    """
    if isinstance(d, dict):
        by_name: dict[str, Any] = {}
        for name in sorted(d):
            by_name[name] = d[name]
        return by_name

    return d
233
+
234
+
235
def _order_llm_usage(llm_usage: dict[str, Any]) -> dict[str, Any]:
    """Order an aggregated llm_usage structure.

    The top-level keys follow LLM_USAGE_FIELD_ORDER. When a "models" dict is
    present, its model names are sorted and each per-model dict follows
    LLM_MODEL_USAGE_FIELD_ORDER. An empty or None value is returned as-is.
    """
    if not llm_usage:
        return llm_usage

    ordered = order_dict(llm_usage, LLM_USAGE_FIELD_ORDER)
    if "models" not in ordered:
        return ordered

    per_model = ordered["models"]
    if isinstance(per_model, dict):
        # Model names alphabetically, fields within each model in fixed order.
        ordered["models"] = {
            name: order_dict(per_model[name], LLM_MODEL_USAGE_FIELD_ORDER)
            for name in sorted(per_model)
        }

    return ordered
255
+
256
+
257
def _transform_file_entry(entry: dict[str, Any]) -> dict[str, Any]:
    """Build the report form of a local-file (document) entry.

    Works on a shallow copy: a numeric "duration" becomes a formatted string,
    keys follow FILE_ENTRY_FIELD_ORDER, and a dict-valued "llm_usage" gets its
    model names sorted with each model's fields in LLM_MODEL_USAGE_FIELD_ORDER.
    """
    formatted = dict(entry)

    # Only numeric durations are converted; other values are left as they are.
    if isinstance(formatted.get("duration"), (int, float)):
        formatted["duration"] = _format_duration(formatted["duration"])

    formatted = order_dict(formatted, FILE_ENTRY_FIELD_ORDER)

    usage = formatted.get("llm_usage")
    if isinstance(usage, dict):
        formatted["llm_usage"] = {
            model: order_dict(usage[model], LLM_MODEL_USAGE_FIELD_ORDER)
            for model in sorted(usage)
        }

    return formatted
281
+
282
+
283
def _transform_url_entry(entry: dict[str, Any]) -> dict[str, Any]:
    """Build the report form of a URL entry.

    Works on a shallow copy. The separate "fetch_cache_hit" / "llm_cache_hit"
    keys are removed; when at least one of them was set (not None),
    "cache_details" is built from them and "cache_hit" is set to their logical
    OR. A numeric "duration" becomes a formatted string, keys follow
    URL_ENTRY_FIELD_ORDER, and a dict-valued "llm_usage" is ordered per model.
    """
    formatted = dict(entry)

    # The two separate flags never appear in the output, even when both are None.
    fetch_hit = formatted.pop("fetch_cache_hit", None)
    llm_hit = formatted.pop("llm_cache_hit", None)

    if not (fetch_hit is None and llm_hit is None):
        fetch_flag = bool(fetch_hit)
        llm_flag = bool(llm_hit)
        formatted["cache_hit"] = fetch_flag or llm_flag
        formatted["cache_details"] = order_dict(
            {"fetch": fetch_flag, "llm": llm_flag},
            CACHE_DETAILS_FIELD_ORDER,
        )

    if isinstance(formatted.get("duration"), (int, float)):
        formatted["duration"] = _format_duration(formatted["duration"])

    formatted = order_dict(formatted, URL_ENTRY_FIELD_ORDER)

    usage = formatted.get("llm_usage")
    if isinstance(usage, dict):
        formatted["llm_usage"] = {
            model: order_dict(usage[model], LLM_MODEL_USAGE_FIELD_ORDER)
            for model in sorted(usage)
        }

    return formatted
322
+
323
+
324
def _order_image_entry(entry: dict[str, Any]) -> dict[str, Any]:
    """Order one image entry (formerly asset entry).

    Keys follow IMAGE_ENTRY_FIELD_ORDER; a dict-valued "llm_usage" gets its
    model names sorted with each model's fields in LLM_MODEL_USAGE_FIELD_ORDER.
    """
    ordered = order_dict(entry, IMAGE_ENTRY_FIELD_ORDER)
    if "llm_usage" not in ordered:
        return ordered

    usage = ordered["llm_usage"]
    if isinstance(usage, dict):
        ordered["llm_usage"] = {
            model: order_dict(usage[model], LLM_MODEL_USAGE_FIELD_ORDER)
            for model in sorted(usage)
        }

    return ordered
338
+
339
+
340
def _transform_summary(summary: dict[str, Any]) -> dict[str, Any]:
    """Build the report form of the summary section.

    Works on a shallow copy: numeric "duration" and "processing_time" values
    become formatted strings, then keys follow SUMMARY_FIELD_ORDER.
    """
    converted = dict(summary)

    # Both timing fields get the same treatment; non-numeric values are kept.
    for field in ("duration", "processing_time"):
        if isinstance(converted.get(field), (int, float)):
            converted[field] = _format_duration(converted[field])

    return order_dict(converted, SUMMARY_FIELD_ORDER)
358
+
359
+
360
def _group_urls_by_source(urls: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Group a flat ``url -> entry`` mapping by each entry's source_file.

    Each group carries "total" / "completed" / "failed" counters and a "urls"
    dict whose entries are copies of the originals without "source_file".
    """
    grouped: dict[str, dict[str, Any]] = {}

    for url, url_data in urls.items():
        source_file = url_data.get("source_file")
        if source_file is None:
            # Covers both a missing key and an explicit null. A None group key
            # would make the later sorted() call fail next to string keys.
            source_file = "unknown.urls"

        bucket = grouped.setdefault(
            source_file,
            {"total": 0, "completed": 0, "failed": 0, "urls": {}},
        )

        # Count status
        bucket["total"] += 1
        status = url_data.get("status", "pending")
        if status == "completed":
            bucket["completed"] += 1
        elif status == "failed":
            bucket["failed"] += 1

        # source_file is redundant once the entry sits under its group
        bucket["urls"][url] = {
            key: value for key, value in url_data.items() if key != "source_file"
        }

    return grouped


def _transform_url_sources(url_sources: dict[str, Any]) -> dict[str, Any]:
    """Sort url_sources by file name and transform every URL entry inside."""
    ordered_sources: dict[str, Any] = {}

    for file_name in sorted(url_sources):
        ordered_entry = order_dict(url_sources[file_name], URL_FILE_ENTRY_FIELD_ORDER)

        # URLs keep their original order within a file; only fields are reordered.
        if "urls" in ordered_entry and isinstance(ordered_entry["urls"], dict):
            ordered_entry["urls"] = {
                url: _transform_url_entry(url_data)
                for url, url_data in ordered_entry["urls"].items()
            }

        ordered_sources[file_name] = ordered_entry

    return ordered_sources


def order_report(report: dict[str, Any]) -> dict[str, Any]:
    """Order and transform all fields in a report.json structure.

    This function:
    1. Converts durations to human-readable format
    2. Builds cache_details for URL entries
    3. Transforms flat "urls" into hierarchical "url_sources" (grouped by each
       entry's source_file, "unknown.urls" when it is missing or None) and
       removes the "urls" key
    4. Orders all fields according to specification

    When "urls" is absent but "url_sources" is already hierarchical, the
    existing "url_sources" is ordered and transformed instead. When both are
    present, the structure derived from "urls" replaces "url_sources".

    Args:
        report: Report dictionary

    Returns:
        New dict with all fields properly transformed and ordered
    """
    result = dict(report)  # Copy

    # Order options
    if "options" in result and isinstance(result["options"], dict):
        result["options"] = order_dict(result["options"], OPTIONS_FIELD_ORDER)

    # Transform summary (duration formatting)
    if "summary" in result and isinstance(result["summary"], dict):
        result["summary"] = _transform_summary(result["summary"])

    # Order llm_usage
    if "llm_usage" in result and isinstance(result["llm_usage"], dict):
        result["llm_usage"] = _order_llm_usage(result["llm_usage"])

    # Order documents (alphabetically by path)
    if "documents" in result and isinstance(result["documents"], dict):
        documents = result["documents"]
        result["documents"] = {
            path: _transform_file_entry(documents[path]) for path in sorted(documents)
        }

    # Transform urls (flat dict) -> url_sources (hierarchical by source_file)
    if "urls" in result and isinstance(result["urls"], dict):
        result["url_sources"] = _transform_url_sources(
            _group_urls_by_source(result["urls"])
        )
        del result["urls"]

    # Transform url_sources if already in hierarchical format
    elif "url_sources" in result and isinstance(result["url_sources"], dict):
        result["url_sources"] = _transform_url_sources(result["url_sources"])

    # Order top-level fields
    return order_dict(result, REPORT_FIELD_ORDER)
458
+
459
+
460
def order_state(state: dict[str, Any]) -> dict[str, Any]:
    """Order all fields in a state.json structure.

    Note: values are kept as-is (no duration formatting, no cache_details
    merging) for resume compatibility. Only key ordering is applied.
    Documents are sorted by path; URLs keep their original order.

    Args:
        state: State dictionary

    Returns:
        New dict with all fields properly ordered
    """

    def _order_entry(entry: Any, field_order: list[str]) -> Any:
        # Order one document/URL entry and, if present, its per-model llm_usage.
        ordered_entry = order_dict(entry, field_order)
        if "llm_usage" in ordered_entry:
            usage = ordered_entry["llm_usage"]
            if isinstance(usage, dict):
                ordered_entry["llm_usage"] = {
                    model: order_dict(usage[model], LLM_MODEL_USAGE_FIELD_ORDER)
                    for model in sorted(usage)
                }
        return ordered_entry

    ordered_state = order_dict(dict(state), STATE_FIELD_ORDER)

    options = ordered_state.get("options")
    if isinstance(options, dict):
        ordered_state["options"] = order_dict(options, OPTIONS_FIELD_ORDER)

    # Documents: alphabetical by path.
    documents = ordered_state.get("documents")
    if isinstance(documents, dict):
        ordered_state["documents"] = {
            path: _order_entry(documents[path], FILE_ENTRY_FIELD_ORDER)
            for path in sorted(documents)
        }

    # URLs: original order is preserved for resume compatibility.
    urls = ordered_state.get("urls")
    if isinstance(urls, dict):
        ordered_state["urls"] = {
            url: _order_entry(url_data, URL_ENTRY_FIELD_ORDER)
            for url, url_data in urls.items()
        }

    return ordered_state
515
+
516
+
517
def _migrate_image_entry(entry: Any) -> Any:
    """Rename the legacy "asset" key of an image entry to "path" (on a copy)."""
    if isinstance(entry, dict) and "asset" in entry and "path" not in entry:
        entry = dict(entry)
        entry["path"] = entry.pop("asset")
    return entry


def order_images(images: dict[str, Any]) -> dict[str, Any]:
    """Order all fields in an images.json structure (formerly assets.json).

    Also handles field name migration:
    - assets -> images (top-level list)
    - asset -> path (within each entry)

    A legacy key is renamed only when the new key is absent, so data already
    stored under the new name is never overwritten. The input dict and its
    entries are not modified.

    Args:
        images: Images dictionary

    Returns:
        New dict with all fields properly ordered
    """
    result = dict(images)

    # Rename before ordering so the list lands in the "images" position.
    if "assets" in result and "images" not in result:
        result["images"] = result.pop("assets")

    result = order_dict(result, IMAGES_FIELD_ORDER)

    # Order each image entry (field order plus nested per-model llm_usage)
    if "images" in result and isinstance(result["images"], list):
        result["images"] = [
            _order_image_entry(_migrate_image_entry(image))
            for image in result["images"]
        ]

    return result