markitai-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. markitai/__init__.py +3 -0
  2. markitai/batch.py +1316 -0
  3. markitai/cli.py +3979 -0
  4. markitai/config.py +602 -0
  5. markitai/config.schema.json +748 -0
  6. markitai/constants.py +222 -0
  7. markitai/converter/__init__.py +49 -0
  8. markitai/converter/_patches.py +98 -0
  9. markitai/converter/base.py +164 -0
  10. markitai/converter/image.py +181 -0
  11. markitai/converter/legacy.py +606 -0
  12. markitai/converter/office.py +526 -0
  13. markitai/converter/pdf.py +679 -0
  14. markitai/converter/text.py +63 -0
  15. markitai/fetch.py +1725 -0
  16. markitai/image.py +1335 -0
  17. markitai/json_order.py +550 -0
  18. markitai/llm.py +4339 -0
  19. markitai/ocr.py +347 -0
  20. markitai/prompts/__init__.py +159 -0
  21. markitai/prompts/cleaner.md +93 -0
  22. markitai/prompts/document_enhance.md +77 -0
  23. markitai/prompts/document_enhance_complete.md +65 -0
  24. markitai/prompts/document_process.md +60 -0
  25. markitai/prompts/frontmatter.md +28 -0
  26. markitai/prompts/image_analysis.md +21 -0
  27. markitai/prompts/image_caption.md +8 -0
  28. markitai/prompts/image_description.md +13 -0
  29. markitai/prompts/page_content.md +17 -0
  30. markitai/prompts/url_enhance.md +78 -0
  31. markitai/security.py +286 -0
  32. markitai/types.py +30 -0
  33. markitai/urls.py +187 -0
  34. markitai/utils/__init__.py +33 -0
  35. markitai/utils/executor.py +69 -0
  36. markitai/utils/mime.py +85 -0
  37. markitai/utils/office.py +262 -0
  38. markitai/utils/output.py +53 -0
  39. markitai/utils/paths.py +81 -0
  40. markitai/utils/text.py +359 -0
  41. markitai/workflow/__init__.py +37 -0
  42. markitai/workflow/core.py +760 -0
  43. markitai/workflow/helpers.py +509 -0
  44. markitai/workflow/single.py +369 -0
  45. markitai-0.3.0.dist-info/METADATA +159 -0
  46. markitai-0.3.0.dist-info/RECORD +48 -0
  47. markitai-0.3.0.dist-info/WHEEL +4 -0
  48. markitai-0.3.0.dist-info/entry_points.txt +2 -0
markitai/utils/paths.py ADDED
@@ -0,0 +1,81 @@
+ """Path utilities for directory management.
+
+ This module provides helper functions for creating and managing
+ output directories used throughout markitai.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+
+
+ def ensure_dir(path: Path) -> Path:
+     """Ensure a directory exists, creating it if necessary.
+
+     Args:
+         path: Directory path to ensure exists
+
+     Returns:
+         The same path (for chaining)
+
+     Examples:
+         >>> ensure_dir(Path("/tmp/output"))
+         PosixPath('/tmp/output')
+     """
+     path.mkdir(parents=True, exist_ok=True)
+     return path
+
+
+ def ensure_subdir(parent: Path, name: str) -> Path:
+     """Ensure a subdirectory exists under the parent directory.
+
+     Args:
+         parent: Parent directory path
+         name: Subdirectory name
+
+     Returns:
+         Path to the created subdirectory
+
+     Examples:
+         >>> ensure_subdir(Path("/tmp/output"), "assets")
+         PosixPath('/tmp/output/assets')
+     """
+     subdir = parent / name
+     subdir.mkdir(parents=True, exist_ok=True)
+     return subdir
+
+
+ def ensure_assets_dir(output_dir: Path) -> Path:
+     """Ensure the assets subdirectory exists.
+
+     This is a convenience wrapper for ensure_subdir(output_dir, "assets").
+
+     Args:
+         output_dir: Output directory path
+
+     Returns:
+         Path to the assets directory
+
+     Examples:
+         >>> ensure_assets_dir(Path("/tmp/output"))
+         PosixPath('/tmp/output/assets')
+     """
+     return ensure_subdir(output_dir, "assets")
+
+
+ def ensure_screenshots_dir(output_dir: Path) -> Path:
+     """Ensure the screenshots subdirectory exists.
+
+     This is a convenience wrapper for ensure_subdir(output_dir, "screenshots").
+
+     Args:
+         output_dir: Output directory path
+
+     Returns:
+         Path to the screenshots directory
+
+     Examples:
+         >>> ensure_screenshots_dir(Path("/tmp/output"))
+         PosixPath('/tmp/output/screenshots')
+     """
+     return ensure_subdir(output_dir, "screenshots")
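Since each helper returns the directory it ensured, calls compose. A minimal usage sketch (the /tmp/output path is illustrative, not part of the package):

    from pathlib import Path

    from markitai.utils.paths import ensure_assets_dir, ensure_dir

    # ensure_dir returns its argument, so the output root and its assets/
    # subdirectory can be created in a single chained expression.
    assets = ensure_assets_dir(ensure_dir(Path("/tmp/output")))
    assert assets == Path("/tmp/output/assets")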
markitai/utils/text.py ADDED
@@ -0,0 +1,359 @@
+ """Text processing utilities for Markitai."""
+
+ from __future__ import annotations
+
+ import re
+
+
+ def clean_residual_placeholders(content: str) -> str:
+     """Remove residual MARKITAI placeholders from content.
+
+     Some placeholders may leak into the output, especially in image references
+     like `![](__MARKITAI_FILE_ASSET__)`. This function cleans them up.
+
+     Args:
+         content: Markdown content with potential residual placeholders
+
+     Returns:
+         Cleaned content
+     """
+     # Remove standalone placeholder lines
+     content = re.sub(r"^__MARKITAI_[A-Z_]+_?\d*__\s*$", "", content, flags=re.MULTILINE)
+
+     # Remove image references with placeholder URLs
+     content = re.sub(r"!\[[^\]]*\]\(__MARKITAI_[A-Z_]+_?\d*__\)\s*\n?", "", content)
+
+     # Remove any other inline placeholders
+     content = re.sub(r"__MARKITAI_[A-Z_]+_?\d*__", "", content)
+
+     return content
+
+
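A minimal sketch of the intended effect; the input string is fabricated, but the placeholder token is the one named in the docstring above:

    from markitai.utils.text import clean_residual_placeholders

    md = "Intro\n\n__MARKITAI_FILE_ASSET__\n\n![fig](__MARKITAI_FILE_ASSET__)\nOutro"
    cleaned = clean_residual_placeholders(md)
    # The standalone placeholder line, the image reference, and any other
    # inline tokens are removed; the surrounding prose is preserved.
    assert "__MARKITAI_" not in cleaned
    assert "Intro" in cleaned and "Outro" in cleaned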
+ def normalize_markdown_whitespace(content: str) -> str:
+     """Normalize whitespace in markdown content.
+
+     - Ensure headers (#) have one blank line before and after
+     - Collapse runs of two or more blank lines into a single blank line
+     - Ensure consistent line endings
+     - Strip trailing whitespace from lines
+
+     Note: Header normalization is markdown-aware and correctly handles
+     nested code blocks (e.g., ```` containing ```).
+
+     Args:
+         content: Markdown content to normalize
+
+     Returns:
+         Normalized content
+     """
+     # Strip trailing whitespace from each line
+     lines = [line.rstrip() for line in content.split("\n")]
+
+     # Normalize header spacing (markdown-aware, skip code blocks)
+     result_lines: list[str] = []
+     code_block_char: str | None = None  # '`' or '~'
+     code_block_count: int = 0  # Number of fence chars that opened the block
+
+     for i, line in enumerate(lines):
+         # Check for code fence (``` or ~~~, possibly more)
+         fence_match = re.match(r"^(`{3,}|~{3,})", line)
+         if fence_match:
+             fence = fence_match.group(1)
+             fence_char = fence[0]
+             fence_count = len(fence)
+
+             if code_block_char is None:
+                 # Start of code block
+                 code_block_char = fence_char
+                 code_block_count = fence_count
+             elif fence_char == code_block_char and fence_count >= code_block_count:
+                 # End of code block (same char, count >= opening)
+                 code_block_char = None
+                 code_block_count = 0
+             # else: fence inside code block, ignore
+
+         # Only process headers and slide comments outside code blocks
+         in_code_block = code_block_char is not None
+
+         # ATX headers: 1-6 # followed by space or end of line
+         # Excludes: #hashtag, #123, #! (shebang)
+         is_atx_header = bool(re.match(r"^#{1,6}(\s|$)", line))
+
+         # Slide comments: <!-- Slide number: X -->
+         is_slide_comment = bool(
+             re.match(r"^<!--\s*Slide\s+(number:\s*)?\d+\s*-->", line)
+         )
+
+         needs_spacing = is_atx_header or is_slide_comment
+
+         if not in_code_block and needs_spacing:
+             # Add blank line before if needed
+             if result_lines and result_lines[-1] != "":
+                 result_lines.append("")
+             result_lines.append(line)
+             # Add blank line after if next line is not empty
+             if i + 1 < len(lines) and lines[i + 1] != "":
+                 result_lines.append("")
+         else:
+             result_lines.append(line)
+
+     content = "\n".join(result_lines)
+
+     # Collapse runs of blank lines (3+ consecutive newlines) into a single blank line
+     content = re.sub(r"\n{3,}", "\n\n", content)
+
+     # Ensure single newline at end
+     return content.strip() + "\n"
+
+
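A small sketch (fabricated input) of the documented behaviors in one pass: spacing appears around the header, a header-like line inside a fence is left alone, and the run of blank lines collapses:

    from markitai.utils.text import normalize_markdown_whitespace

    md = "intro\n## Title\nbody\n```\n# not a header\n```\n\n\n\nend"
    out = normalize_markdown_whitespace(md)
    # Blank lines are inserted around "## Title"; the "#" line inside the
    # fence is untouched; the blank-line run before "end" becomes one.
    assert out == "intro\n\n## Title\n\nbody\n```\n# not a header\n```\n\nend\n"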
+ def fix_broken_markdown_links(content: str) -> str:
+     """Fix broken markdown links where text and URL are split by newlines.
+
+     Common pattern from web scraping:
+         [Title text
+
+         Description text](/url)
+
+     Should become: [Title text](/url)
+
+     Args:
+         content: Markdown content with potentially broken links
+
+     Returns:
+         Content with fixed links
+     """
+     # Pattern: [text with newlines inside](url)
+     # Captures: [anything with newlines](url) and keeps only first line + url
+     pattern = r"\[([^\]]*?)\n+([^\]]*?)\]\(([^)]+)\)"
+
+     def fix_link(match: re.Match[str]) -> str:
+         first_part = match.group(1).strip()
+         url = match.group(3)
+         return f"[{first_part}]({url})"
+
+     # Apply fix iteratively until no more changes
+     prev_content = ""
+     while prev_content != content:
+         prev_content = content
+         content = re.sub(pattern, fix_link, content)
+
+     return content
+
+
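For instance, on a fabricated scraped snippet, the rewrite collapses a link whose text was split across lines, keeping only the first line of the text:

    from markitai.utils.text import fix_broken_markdown_links

    broken = "[Latest release\n\nsee the changelog](/releases/0.3.0)"
    # Only the first line of the link text survives, attached to the URL.
    assert fix_broken_markdown_links(broken) == "[Latest release](/releases/0.3.0)"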
+ def clean_ppt_headers_footers(content: str) -> str:
+     """Clean PPT/PDF headers and footers that appear at the end of each page/slide.
+
+     Pattern: Short lines (< 30 chars each) at the end of page blocks,
+     appearing repeatedly across multiple pages.
+
+     Example pattern to remove:
+         FTD
+         FREE TEST DATA
+         2
+
+     Args:
+         content: Markdown content with potential headers/footers
+
+     Returns:
+         Cleaned content
+     """
+     # Split by page/slide markers
+     page_pattern = r"(<!-- (?:Page|Slide) (?:number: ?)?\d+ -->)"
+     parts = re.split(page_pattern, content)
+
+     if len(parts) < 3:
+         # Not enough pages to detect pattern
+         return content
+
+     # Analyze ending patterns for each page block
+     page_endings: list[list[str]] = []
+
+     for i, part in enumerate(parts):
+         if re.match(r"<!-- (?:Page|Slide)", part):
+             continue
+         # Get the content block after a page marker
+         if i > 0 and re.match(r"<!-- (?:Page|Slide)", parts[i - 1]):
+             # Extract last few lines (potential footer)
+             lines = [ln.strip() for ln in part.strip().split("\n") if ln.strip()]
+             if len(lines) >= 2:
+                 # Take the last (up to) 4 lines as the potential footer
+                 ending = lines[-4:]
+                 # Filter to short lines only (< 30 chars, not starting with # or !)
+                 short_lines = [
+                     ln
+                     for ln in ending
+                     if len(ln) < 30 and not ln.startswith(("#", "!", "[", "-", "*"))
+                 ]
+                 if short_lines:
+                     page_endings.append(short_lines)
+
+     if len(page_endings) < 3:
+         return content
+
+     # Find common ending pattern (appears in >= 50% of pages)
+     from collections import Counter
+
+     # Count each unique ending line
+     all_ending_lines: list[str] = []
+     for ending in page_endings:
+         all_ending_lines.extend(ending)
+
+     line_counts = Counter(all_ending_lines)
+     threshold = len(page_endings) * 0.5
+
+     # Lines that appear frequently (excluding pure numbers which are page numbers)
+     common_lines = {
+         line
+         for line, count in line_counts.items()
+         if count >= threshold and not line.isdigit()
+     }
+
+     if not common_lines:
+         return content
+
+     # Remove common footer lines from content
+     # Also remove adjacent page numbers (single digit or 2-digit numbers)
+     result_lines: list[str] = []
+     lines = content.split("\n")
+
+     i = 0
+     while i < len(lines):
+         line = lines[i].strip()
+         # Check if this line and nearby lines form a footer pattern
+         if line in common_lines:
+             # Skip this line and check for adjacent page number
+             if i + 1 < len(lines) and lines[i + 1].strip().isdigit():
+                 i += 2  # Skip both
+             else:
+                 i += 1
+         elif line.isdigit() and i > 0:
+             # Check if a recent preceding line was a common footer line
+             prev_line = lines[i - 1].strip()
+             if prev_line in common_lines or (
+                 i >= 2 and lines[i - 2].strip() in common_lines
+             ):
+                 i += 1  # Skip page number
+             else:
+                 result_lines.append(lines[i])
+                 i += 1
+         else:
+             result_lines.append(lines[i])
+             i += 1
+
+     return "\n".join(result_lines)
+
+
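A sketch of the heuristic on fabricated slides; three slides are the minimum for the >= 50% detection to engage, and the recurring footer plus its adjacent page number are stripped while per-slide body text survives:

    from markitai.utils.text import clean_ppt_headers_footers

    slides = "\n".join(
        f"<!-- Slide number: {n} -->\nSlide {n} body text\nFREE TEST DATA\n{n}"
        for n in (1, 2, 3)
    )
    cleaned = clean_ppt_headers_footers(slides)
    # The footer repeats on every slide, so it and the page numbers go;
    # the body lines never repeat, so they stay.
    assert "FREE TEST DATA" not in cleaned
    assert "Slide 2 body text" in cleaned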
+ def dedupe_paragraphs(content: str, min_length: int = 100) -> str:
+     """Remove duplicate paragraphs from content.
+
+     Useful for cleaning browser-fetched content where the same text
+     may appear multiple times (e.g., Twitter/X pages with repeated
+     content in og:title, aria-label, and main content).
+
+     Args:
+         content: Markdown content with potential duplicate paragraphs
+         min_length: Minimum paragraph length to consider for deduplication.
+             Shorter paragraphs are always kept (to avoid removing headers, etc.)
+
+     Returns:
+         Content with duplicate paragraphs removed (first occurrence kept)
+     """
+     # Split by double newlines (paragraph separator)
+     paragraphs = re.split(r"\n\n+", content)
+
+     seen_paragraphs: set[str] = set()
+     result_paragraphs: list[str] = []
+
+     for para in paragraphs:
+         para_stripped = para.strip()
+
+         # Skip empty paragraphs
+         if not para_stripped:
+             continue
+
+         # Normalize for comparison: collapse whitespace
+         normalized = re.sub(r"\s+", " ", para_stripped)
+
+         # Short paragraphs: always keep (headers, short lines, etc.)
+         if len(normalized) < min_length:
+             result_paragraphs.append(para)
+             continue
+
+         # Long paragraphs: dedupe
+         if normalized not in seen_paragraphs:
+             seen_paragraphs.add(normalized)
+             result_paragraphs.append(para)
+         # else: skip duplicate
+
+     return "\n\n".join(result_paragraphs)
+
+
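A fabricated example of the paragraph-level pass; note the repeated blurb must reach the 100-character default for deduplication to apply, while the short heading is always kept:

    from markitai.utils.text import dedupe_paragraphs

    blurb = (
        "This exact paragraph was lifted from both the og:title metadata and "
        "the page body, so it shows up twice in the raw fetched markdown."
    )
    doc = f"# Post\n\n{blurb}\n\n{blurb}"
    deduped = dedupe_paragraphs(doc)
    # First occurrence of the long paragraph wins; the second is dropped.
    assert deduped.count("lifted from both") == 1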
+ def dedupe_long_text_blocks(content: str, min_length: int = 50) -> str:
+     """Remove duplicate long text blocks from content.
+
+     More aggressive deduplication for social media content where the same
+     long text appears multiple times in different formatting contexts
+     (e.g., Twitter aria-label, og:title, and main content).
+
+     This function:
+     1. Finds all "long text blocks" (continuous text >= min_length chars)
+     2. Removes duplicate occurrences, keeping the first one
+     3. Also handles cases where text is prefixed with usernames/metadata
+
+     Args:
+         content: Markdown content with potential duplicate text blocks
+         min_length: Minimum text length to consider for deduplication
+
+     Returns:
+         Content with duplicate text blocks removed
+     """
+     lines = content.split("\n")
+     result_lines: list[str] = []
+     seen_texts: list[str] = []  # Use list to preserve order for substring matching
+
+     for line in lines:
+         line_stripped = line.strip()
+
+         # Skip short lines, markdown syntax, or empty lines
+         if not line_stripped or len(line_stripped) < min_length:
+             result_lines.append(line)
+             continue
+
+         # Skip lines that are primarily markdown syntax
+         if line_stripped.startswith(("#", "!", "[", "|", "-", "*", ">", "```", "<!--")):
+             result_lines.append(line)
+             continue
+
+         # Normalize text for comparison
+         # Remove leading username patterns (Twitter/X format)
+         normalized = re.sub(r"^[A-Za-z\s]+@\w+\s+", "", line_stripped)
+         # Remove @ mentions at start
+         normalized = re.sub(r"^@\w+\s*", "", normalized)
+         # Remove timestamps and metrics
+         normalized = re.sub(r"\d+:\d+\s*(AM|PM|am|pm)?\s*·?\s*", "", normalized)
+         normalized = re.sub(
+             r"\d+\s*(replies|reposts|likes|views|bookmarks)[,\s]*",
+             "",
+             normalized,
+             flags=re.I,
+         )
+         normalized = re.sub(r"\s+", " ", normalized).strip()
+
+         # If still long enough after normalization
+         if len(normalized) >= min_length:
+             # Check if this text is a duplicate or substring of seen text
+             is_duplicate = False
+             for seen in seen_texts:
+                 # Check both directions: new is in seen, or seen is in new
+                 if normalized in seen or seen in normalized:
+                     is_duplicate = True
+                     break
+
+             if not is_duplicate:
+                 seen_texts.append(normalized)
+                 result_lines.append(line)
+             # else: skip duplicate
+         else:
+             result_lines.append(line)
+
+     return "\n".join(result_lines)
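The substring check is what sets this apart from dedupe_paragraphs: a plain later copy is dropped even though it is not byte-identical to the decorated first occurrence. A fabricated example:

    from markitai.utils.text import dedupe_long_text_blocks

    tweet = (
        "Shipping markitai 0.3.0 today with a reworked converter "
        "pipeline and a much faster batch mode"
    )
    page = "\n".join([
        "# Post",
        f"Jane Doe @janedoe {tweet} 3:04 PM",  # aria-label style copy
        tweet,  # plain main-content copy: normalizes to a substring, dropped
    ])
    out = dedupe_long_text_blocks(page)
    # Username and timestamp are stripped before comparison, so the second
    # copy matches the first and only one occurrence remains.
    assert out.count("reworked converter pipeline") == 1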
markitai/workflow/__init__.py ADDED
@@ -0,0 +1,37 @@
+ """Workflow module for document processing pipelines."""
+
+ from __future__ import annotations
+
+ from markitai.workflow.core import (
+     ConversionContext,
+     ConversionStepResult,
+     DocumentConversionError,
+     FileSizeError,
+     UnsupportedFormatError,
+     convert_document_core,
+ )
+ from markitai.workflow.helpers import (
+     add_basic_frontmatter,
+     detect_language,
+     merge_llm_usage,
+     write_images_json,
+ )
+ from markitai.workflow.single import ImageAnalysisResult, SingleFileWorkflow
+
+ __all__ = [
+     "ConversionContext",
+     "ConversionStepResult",
+     "DocumentConversionError",
+     "FileSizeError",
+     "ImageAnalysisResult",
+     "SingleFileWorkflow",
+     "UnsupportedFormatError",
+     "add_basic_frontmatter",
+     "convert_document_core",
+     "detect_language",
+     "merge_llm_usage",
+     "write_images_json",
+ ]
+
+ # Backward compatibility alias
+ write_assets_json = write_images_json
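A minimal sketch of what the alias guarantees to older callers (assuming markitai is importable):

    from markitai.workflow import write_assets_json, write_images_json

    # Old and new names are the same function object, so code written
    # against write_assets_json continues to work unchanged.
    assert write_assets_json is write_images_json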