markitai-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. markitai/__init__.py +3 -0
  2. markitai/batch.py +1316 -0
  3. markitai/cli.py +3979 -0
  4. markitai/config.py +602 -0
  5. markitai/config.schema.json +748 -0
  6. markitai/constants.py +222 -0
  7. markitai/converter/__init__.py +49 -0
  8. markitai/converter/_patches.py +98 -0
  9. markitai/converter/base.py +164 -0
  10. markitai/converter/image.py +181 -0
  11. markitai/converter/legacy.py +606 -0
  12. markitai/converter/office.py +526 -0
  13. markitai/converter/pdf.py +679 -0
  14. markitai/converter/text.py +63 -0
  15. markitai/fetch.py +1725 -0
  16. markitai/image.py +1335 -0
  17. markitai/json_order.py +550 -0
  18. markitai/llm.py +4339 -0
  19. markitai/ocr.py +347 -0
  20. markitai/prompts/__init__.py +159 -0
  21. markitai/prompts/cleaner.md +93 -0
  22. markitai/prompts/document_enhance.md +77 -0
  23. markitai/prompts/document_enhance_complete.md +65 -0
  24. markitai/prompts/document_process.md +60 -0
  25. markitai/prompts/frontmatter.md +28 -0
  26. markitai/prompts/image_analysis.md +21 -0
  27. markitai/prompts/image_caption.md +8 -0
  28. markitai/prompts/image_description.md +13 -0
  29. markitai/prompts/page_content.md +17 -0
  30. markitai/prompts/url_enhance.md +78 -0
  31. markitai/security.py +286 -0
  32. markitai/types.py +30 -0
  33. markitai/urls.py +187 -0
  34. markitai/utils/__init__.py +33 -0
  35. markitai/utils/executor.py +69 -0
  36. markitai/utils/mime.py +85 -0
  37. markitai/utils/office.py +262 -0
  38. markitai/utils/output.py +53 -0
  39. markitai/utils/paths.py +81 -0
  40. markitai/utils/text.py +359 -0
  41. markitai/workflow/__init__.py +37 -0
  42. markitai/workflow/core.py +760 -0
  43. markitai/workflow/helpers.py +509 -0
  44. markitai/workflow/single.py +369 -0
  45. markitai-0.3.0.dist-info/METADATA +159 -0
  46. markitai-0.3.0.dist-info/RECORD +48 -0
  47. markitai-0.3.0.dist-info/WHEEL +4 -0
  48. markitai-0.3.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,748 @@
1
+ {
2
+ "$defs": {
3
+ "AgentBrowserConfig": {
4
+ "description": "agent-browser configuration for JS-rendered pages.",
5
+ "properties": {
6
+ "command": {
7
+ "default": "agent-browser",
8
+ "title": "Command",
9
+ "type": "string"
10
+ },
11
+ "timeout": {
12
+ "default": 30000,
13
+ "title": "Timeout",
14
+ "type": "integer"
15
+ },
16
+ "wait_for": {
17
+ "default": "domcontentloaded",
18
+ "enum": [
19
+ "load",
20
+ "domcontentloaded",
21
+ "networkidle"
22
+ ],
23
+ "title": "Wait For",
24
+ "type": "string"
25
+ },
26
+ "extra_wait_ms": {
27
+ "default": 1000,
28
+ "title": "Extra Wait Ms",
29
+ "type": "integer"
30
+ },
31
+ "session": {
32
+ "anyOf": [
33
+ {
34
+ "type": "string"
35
+ },
36
+ {
37
+ "type": "null"
38
+ }
39
+ ],
40
+ "default": null,
41
+ "title": "Session"
42
+ }
43
+ },
44
+ "title": "AgentBrowserConfig",
45
+ "type": "object"
46
+ },
47
+ "BatchConfig": {
48
+ "description": "Batch processing configuration.",
49
+ "properties": {
50
+ "concurrency": {
51
+ "default": 10,
52
+ "minimum": 1,
53
+ "title": "Concurrency",
54
+ "type": "integer"
55
+ },
56
+ "url_concurrency": {
57
+ "default": 5,
58
+ "minimum": 1,
59
+ "title": "Url Concurrency",
60
+ "type": "integer"
61
+ },
62
+ "state_flush_interval_seconds": {
63
+ "default": 10,
64
+ "title": "State Flush Interval Seconds",
65
+ "type": "integer"
66
+ },
67
+ "scan_max_depth": {
68
+ "default": 5,
69
+ "minimum": 1,
70
+ "title": "Scan Max Depth",
71
+ "type": "integer"
72
+ },
73
+ "scan_max_files": {
74
+ "default": 10000,
75
+ "minimum": 1,
76
+ "title": "Scan Max Files",
77
+ "type": "integer"
78
+ }
79
+ },
80
+ "title": "BatchConfig",
81
+ "type": "object"
82
+ },
83
+ "CacheConfig": {
84
+ "description": "Cache configuration.",
85
+ "properties": {
86
+ "enabled": {
87
+ "default": true,
88
+ "title": "Enabled",
89
+ "type": "boolean"
90
+ },
91
+ "no_cache": {
92
+ "default": false,
93
+ "title": "No Cache",
94
+ "type": "boolean"
95
+ },
96
+ "no_cache_patterns": {
97
+ "default": [],
98
+ "items": {
99
+ "type": "string"
100
+ },
101
+ "title": "No Cache Patterns",
102
+ "type": "array"
103
+ },
104
+ "max_size_bytes": {
105
+ "default": 536870912,
106
+ "title": "Max Size Bytes",
107
+ "type": "integer"
108
+ },
109
+ "global_dir": {
110
+ "default": "~/.markitai",
111
+ "title": "Global Dir",
112
+ "type": "string"
113
+ }
114
+ },
115
+ "title": "CacheConfig",
116
+ "type": "object"
117
+ },
118
+ "FetchConfig": {
119
+ "description": "URL fetch configuration for handling static and JS-rendered pages.",
120
+ "properties": {
121
+ "strategy": {
122
+ "default": "auto",
123
+ "enum": [
124
+ "auto",
125
+ "static",
126
+ "browser",
127
+ "jina"
128
+ ],
129
+ "title": "Strategy",
130
+ "type": "string"
131
+ },
132
+ "agent_browser": {
133
+ "$ref": "#/$defs/AgentBrowserConfig"
134
+ },
135
+ "jina": {
136
+ "$ref": "#/$defs/JinaConfig"
137
+ },
138
+ "fallback_patterns": {
139
+ "items": {
140
+ "type": "string"
141
+ },
142
+ "title": "Fallback Patterns",
143
+ "type": "array"
144
+ }
145
+ },
146
+ "title": "FetchConfig",
147
+ "type": "object"
148
+ },
149
+ "ImageConfig": {
150
+ "description": "Image processing configuration.",
151
+ "properties": {
152
+ "alt_enabled": {
153
+ "default": false,
154
+ "title": "Alt Enabled",
155
+ "type": "boolean"
156
+ },
157
+ "desc_enabled": {
158
+ "default": false,
159
+ "title": "Desc Enabled",
160
+ "type": "boolean"
161
+ },
162
+ "compress": {
163
+ "default": true,
164
+ "title": "Compress",
165
+ "type": "boolean"
166
+ },
167
+ "quality": {
168
+ "default": 75,
169
+ "maximum": 100,
170
+ "minimum": 1,
171
+ "title": "Quality",
172
+ "type": "integer"
173
+ },
174
+ "format": {
175
+ "default": "jpeg",
176
+ "enum": [
177
+ "jpeg",
178
+ "png",
179
+ "webp"
180
+ ],
181
+ "title": "Format",
182
+ "type": "string"
183
+ },
184
+ "max_width": {
185
+ "default": 1920,
186
+ "title": "Max Width",
187
+ "type": "integer"
188
+ },
189
+ "max_height": {
190
+ "default": 99999,
191
+ "title": "Max Height",
192
+ "type": "integer"
193
+ },
194
+ "filter": {
195
+ "$ref": "#/$defs/ImageFilterConfig"
196
+ }
197
+ },
198
+ "title": "ImageConfig",
199
+ "type": "object"
200
+ },
201
+ "ImageFilterConfig": {
202
+ "description": "Image filter configuration.",
203
+ "properties": {
204
+ "min_width": {
205
+ "default": 50,
206
+ "title": "Min Width",
207
+ "type": "integer"
208
+ },
209
+ "min_height": {
210
+ "default": 50,
211
+ "title": "Min Height",
212
+ "type": "integer"
213
+ },
214
+ "min_area": {
215
+ "default": 5000,
216
+ "title": "Min Area",
217
+ "type": "integer"
218
+ },
219
+ "deduplicate": {
220
+ "default": true,
221
+ "title": "Deduplicate",
222
+ "type": "boolean"
223
+ }
224
+ },
225
+ "title": "ImageFilterConfig",
226
+ "type": "object"
227
+ },
228
+ "JinaConfig": {
229
+ "description": "Jina Reader API configuration.",
230
+ "properties": {
231
+ "api_key": {
232
+ "anyOf": [
233
+ {
234
+ "type": "string"
235
+ },
236
+ {
237
+ "type": "null"
238
+ }
239
+ ],
240
+ "default": null,
241
+ "title": "Api Key"
242
+ },
243
+ "timeout": {
244
+ "default": 30,
245
+ "title": "Timeout",
246
+ "type": "integer"
247
+ }
248
+ },
249
+ "title": "JinaConfig",
250
+ "type": "object"
251
+ },
252
+ "LLMConfig": {
253
+ "description": "LLM configuration.",
254
+ "properties": {
255
+ "enabled": {
256
+ "default": false,
257
+ "title": "Enabled",
258
+ "type": "boolean"
259
+ },
260
+ "model_list": {
261
+ "items": {
262
+ "$ref": "#/$defs/ModelConfig"
263
+ },
264
+ "title": "Model List",
265
+ "type": "array"
266
+ },
267
+ "router_settings": {
268
+ "$ref": "#/$defs/RouterSettings"
269
+ },
270
+ "concurrency": {
271
+ "default": 10,
272
+ "title": "Concurrency",
273
+ "type": "integer"
274
+ }
275
+ },
276
+ "title": "LLMConfig",
277
+ "type": "object"
278
+ },
279
+ "LiteLLMParams": {
280
+ "description": "LiteLLM parameters for a model.",
281
+ "properties": {
282
+ "model": {
283
+ "title": "Model",
284
+ "type": "string"
285
+ },
286
+ "api_key": {
287
+ "anyOf": [
288
+ {
289
+ "type": "string"
290
+ },
291
+ {
292
+ "type": "null"
293
+ }
294
+ ],
295
+ "default": null,
296
+ "title": "Api Key"
297
+ },
298
+ "api_base": {
299
+ "anyOf": [
300
+ {
301
+ "type": "string"
302
+ },
303
+ {
304
+ "type": "null"
305
+ }
306
+ ],
307
+ "default": null,
308
+ "title": "Api Base"
309
+ },
310
+ "weight": {
311
+ "default": 1,
312
+ "title": "Weight",
313
+ "type": "integer"
314
+ },
315
+ "max_tokens": {
316
+ "anyOf": [
317
+ {
318
+ "type": "integer"
319
+ },
320
+ {
321
+ "type": "null"
322
+ }
323
+ ],
324
+ "default": null,
325
+ "title": "Max Tokens"
326
+ }
327
+ },
328
+ "required": [
329
+ "model"
330
+ ],
331
+ "title": "LiteLLMParams",
332
+ "type": "object"
333
+ },
334
+ "LogConfig": {
335
+ "description": "Logging configuration.",
336
+ "properties": {
337
+ "level": {
338
+ "default": "INFO",
339
+ "enum": [
340
+ "DEBUG",
341
+ "INFO",
342
+ "WARNING",
343
+ "ERROR",
344
+ "CRITICAL"
345
+ ],
346
+ "title": "Level",
347
+ "type": "string"
348
+ },
349
+ "dir": {
350
+ "anyOf": [
351
+ {
352
+ "type": "string"
353
+ },
354
+ {
355
+ "type": "null"
356
+ }
357
+ ],
358
+ "default": "~/.markitai/logs",
359
+ "title": "Dir"
360
+ },
361
+ "rotation": {
362
+ "default": "10 MB",
363
+ "title": "Rotation",
364
+ "type": "string"
365
+ },
366
+ "retention": {
367
+ "default": "7 days",
368
+ "title": "Retention",
369
+ "type": "string"
370
+ }
371
+ },
372
+ "title": "LogConfig",
373
+ "type": "object"
374
+ },
375
+ "ModelConfig": {
376
+ "description": "Model configuration for LiteLLM Router.",
377
+ "properties": {
378
+ "model_name": {
379
+ "title": "Model Name",
380
+ "type": "string"
381
+ },
382
+ "litellm_params": {
383
+ "$ref": "#/$defs/LiteLLMParams"
384
+ },
385
+ "model_info": {
386
+ "anyOf": [
387
+ {
388
+ "$ref": "#/$defs/ModelInfo"
389
+ },
390
+ {
391
+ "type": "null"
392
+ }
393
+ ],
394
+ "default": null
395
+ }
396
+ },
397
+ "required": [
398
+ "model_name",
399
+ "litellm_params"
400
+ ],
401
+ "title": "ModelConfig",
402
+ "type": "object"
403
+ },
404
+ "ModelInfo": {
405
+ "description": "Model metadata.",
406
+ "properties": {
407
+ "supports_vision": {
408
+ "anyOf": [
409
+ {
410
+ "type": "boolean"
411
+ },
412
+ {
413
+ "type": "null"
414
+ }
415
+ ],
416
+ "default": null,
417
+ "description": "Optional. If not set, auto-detected from litellm.",
418
+ "title": "Supports Vision"
419
+ },
420
+ "max_tokens": {
421
+ "anyOf": [
422
+ {
423
+ "type": "integer"
424
+ },
425
+ {
426
+ "type": "null"
427
+ }
428
+ ],
429
+ "default": null,
430
+ "description": "Optional. If not set, auto-detected from litellm.",
431
+ "title": "Max Tokens"
432
+ },
433
+ "max_input_tokens": {
434
+ "anyOf": [
435
+ {
436
+ "type": "integer"
437
+ },
438
+ {
439
+ "type": "null"
440
+ }
441
+ ],
442
+ "default": null,
443
+ "description": "Optional. If not set, auto-detected from litellm.",
444
+ "title": "Max Input Tokens"
445
+ }
446
+ },
447
+ "description": "Model metadata. All fields are optional and auto-detected from litellm if not set.",
448
+ "title": "ModelInfo",
449
+ "type": "object"
450
+ },
451
+ "OCRConfig": {
452
+ "description": "OCR configuration.",
453
+ "properties": {
454
+ "enabled": {
455
+ "default": false,
456
+ "title": "Enabled",
457
+ "type": "boolean"
458
+ },
459
+ "lang": {
460
+ "default": "en",
461
+ "title": "Lang",
462
+ "type": "string"
463
+ }
464
+ },
465
+ "title": "OCRConfig",
466
+ "type": "object"
467
+ },
468
+ "OutputConfig": {
469
+ "description": "Output configuration.",
470
+ "properties": {
471
+ "dir": {
472
+ "default": "./output",
473
+ "title": "Dir",
474
+ "type": "string"
475
+ },
476
+ "on_conflict": {
477
+ "default": "rename",
478
+ "enum": [
479
+ "skip",
480
+ "overwrite",
481
+ "rename"
482
+ ],
483
+ "title": "On Conflict",
484
+ "type": "string"
485
+ },
486
+ "allow_symlinks": {
487
+ "default": false,
488
+ "title": "Allow Symlinks",
489
+ "type": "boolean"
490
+ }
491
+ },
492
+ "title": "OutputConfig",
493
+ "type": "object"
494
+ },
495
+ "PresetConfig": {
496
+ "description": "Preset configuration defining which features to enable.",
497
+ "properties": {
498
+ "llm": {
499
+ "default": false,
500
+ "title": "Llm",
501
+ "type": "boolean"
502
+ },
503
+ "ocr": {
504
+ "default": false,
505
+ "title": "Ocr",
506
+ "type": "boolean"
507
+ },
508
+ "alt": {
509
+ "default": false,
510
+ "title": "Alt",
511
+ "type": "boolean"
512
+ },
513
+ "desc": {
514
+ "default": false,
515
+ "title": "Desc",
516
+ "type": "boolean"
517
+ },
518
+ "screenshot": {
519
+ "default": false,
520
+ "title": "Screenshot",
521
+ "type": "boolean"
522
+ }
523
+ },
524
+ "title": "PresetConfig",
525
+ "type": "object"
526
+ },
527
+ "PromptsConfig": {
528
+ "description": "Prompts configuration.",
529
+ "properties": {
530
+ "dir": {
531
+ "default": "~/.markitai/prompts",
532
+ "title": "Dir",
533
+ "type": "string"
534
+ },
535
+ "cleaner": {
536
+ "anyOf": [
537
+ {
538
+ "type": "string"
539
+ },
540
+ {
541
+ "type": "null"
542
+ }
543
+ ],
544
+ "default": null,
545
+ "title": "Cleaner"
546
+ },
547
+ "frontmatter": {
548
+ "anyOf": [
549
+ {
550
+ "type": "string"
551
+ },
552
+ {
553
+ "type": "null"
554
+ }
555
+ ],
556
+ "default": null,
557
+ "title": "Frontmatter"
558
+ },
559
+ "image_caption": {
560
+ "anyOf": [
561
+ {
562
+ "type": "string"
563
+ },
564
+ {
565
+ "type": "null"
566
+ }
567
+ ],
568
+ "default": null,
569
+ "title": "Image Caption"
570
+ },
571
+ "image_description": {
572
+ "anyOf": [
573
+ {
574
+ "type": "string"
575
+ },
576
+ {
577
+ "type": "null"
578
+ }
579
+ ],
580
+ "default": null,
581
+ "title": "Image Description"
582
+ },
583
+ "image_analysis": {
584
+ "anyOf": [
585
+ {
586
+ "type": "string"
587
+ },
588
+ {
589
+ "type": "null"
590
+ }
591
+ ],
592
+ "default": null,
593
+ "title": "Image Analysis"
594
+ },
595
+ "page_content": {
596
+ "anyOf": [
597
+ {
598
+ "type": "string"
599
+ },
600
+ {
601
+ "type": "null"
602
+ }
603
+ ],
604
+ "default": null,
605
+ "title": "Page Content"
606
+ },
607
+ "document_enhance": {
608
+ "anyOf": [
609
+ {
610
+ "type": "string"
611
+ },
612
+ {
613
+ "type": "null"
614
+ }
615
+ ],
616
+ "default": null,
617
+ "title": "Document Enhance"
618
+ },
619
+ "url_enhance": {
620
+ "anyOf": [
621
+ {
622
+ "type": "string"
623
+ },
624
+ {
625
+ "type": "null"
626
+ }
627
+ ],
628
+ "default": null,
629
+ "title": "Url Enhance"
630
+ }
631
+ },
632
+ "title": "PromptsConfig",
633
+ "type": "object"
634
+ },
635
+ "RouterSettings": {
636
+ "description": "LiteLLM Router settings.",
637
+ "properties": {
638
+ "routing_strategy": {
639
+ "default": "simple-shuffle",
640
+ "enum": [
641
+ "simple-shuffle",
642
+ "least-busy",
643
+ "usage-based-routing",
644
+ "latency-based-routing"
645
+ ],
646
+ "title": "Routing Strategy",
647
+ "type": "string"
648
+ },
649
+ "num_retries": {
650
+ "default": 2,
651
+ "title": "Num Retries",
652
+ "type": "integer"
653
+ },
654
+ "timeout": {
655
+ "default": 120,
656
+ "title": "Timeout",
657
+ "type": "integer"
658
+ },
659
+ "fallbacks": {
660
+ "items": {
661
+ "additionalProperties": true,
662
+ "type": "object"
663
+ },
664
+ "title": "Fallbacks",
665
+ "type": "array"
666
+ }
667
+ },
668
+ "title": "RouterSettings",
669
+ "type": "object"
670
+ },
671
+ "ScreenshotConfig": {
672
+ "description": "Screenshot rendering configuration.\n\nFor PDF/PPTX: Renders pages as JPEG images.\nFor URLs: Captures full-page screenshots using agent-browser.",
673
+ "properties": {
674
+ "enabled": {
675
+ "default": false,
676
+ "title": "Enabled",
677
+ "type": "boolean"
678
+ },
679
+ "viewport_width": {
680
+ "default": 1920,
681
+ "title": "Viewport Width",
682
+ "type": "integer"
683
+ },
684
+ "viewport_height": {
685
+ "default": 1080,
686
+ "title": "Viewport Height",
687
+ "type": "integer"
688
+ },
689
+ "quality": {
690
+ "default": 75,
691
+ "maximum": 100,
692
+ "minimum": 1,
693
+ "title": "Quality",
694
+ "type": "integer"
695
+ },
696
+ "max_height": {
697
+ "default": 10000,
698
+ "title": "Max Height",
699
+ "type": "integer"
700
+ }
701
+ },
702
+ "title": "ScreenshotConfig",
703
+ "type": "object"
704
+ }
705
+ },
706
+ "description": "Main configuration model.",
707
+ "properties": {
708
+ "output": {
709
+ "$ref": "#/$defs/OutputConfig"
710
+ },
711
+ "llm": {
712
+ "$ref": "#/$defs/LLMConfig"
713
+ },
714
+ "image": {
715
+ "$ref": "#/$defs/ImageConfig"
716
+ },
717
+ "ocr": {
718
+ "$ref": "#/$defs/OCRConfig"
719
+ },
720
+ "screenshot": {
721
+ "$ref": "#/$defs/ScreenshotConfig"
722
+ },
723
+ "prompts": {
724
+ "$ref": "#/$defs/PromptsConfig"
725
+ },
726
+ "batch": {
727
+ "$ref": "#/$defs/BatchConfig"
728
+ },
729
+ "log": {
730
+ "$ref": "#/$defs/LogConfig"
731
+ },
732
+ "cache": {
733
+ "$ref": "#/$defs/CacheConfig"
734
+ },
735
+ "fetch": {
736
+ "$ref": "#/$defs/FetchConfig"
737
+ },
738
+ "presets": {
739
+ "additionalProperties": {
740
+ "$ref": "#/$defs/PresetConfig"
741
+ },
742
+ "title": "Presets",
743
+ "type": "object"
744
+ }
745
+ },
746
+ "title": "MarkitaiConfig",
747
+ "type": "object"
748
+ }