repulp 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
repulp/__init__.py ADDED
@@ -0,0 +1,172 @@
1
+ """repulp — Parallel batch document conversion, watch mode, and structured extraction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ import tempfile
7
+ from pathlib import Path
8
+ from typing import Optional, Union
9
+
10
+ __version__ = "0.1.0"
11
+
12
+
13
def convert(
    source: Union[str, Path],
    clean: bool = True,
    frontmatter: bool = False,
    format: str = "md",
) -> "ConversionResult":
    """Convert a file, URL, or path to Markdown.

    Args:
        source: File path, URL (http/https), or "-" for stdin.
        clean: Post-process the markdown output.
        frontmatter: Inject YAML frontmatter with metadata.
        format: Output format - "md", "text", or "json".

    Returns:
        ConversionResult with markdown content and metadata.
    """
    from repulp.converter import convert_file, convert_url, ConversionResult
    from repulp.fetcher import is_url
    from repulp.frontmatter import inject_frontmatter
    from repulp.formatter import format_output

    src = str(source)

    if src == "-":
        # Spool stdin to a temp file so the file-based converter can run on it.
        # NOTE(review): the ".html" suffix assumes stdin carries HTML — confirm.
        stdin_bytes = sys.stdin.buffer.read()
        tmp = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
        tmp.write(stdin_bytes)
        tmp.close()
        try:
            converted = convert_file(Path(tmp.name), clean=clean)
            # Re-label the result so callers see "stdin" instead of the temp path.
            result = ConversionResult(
                source_path=Path("stdin"),
                markdown=converted.markdown,
                success=converted.success,
                error=converted.error,
            )
        finally:
            Path(tmp.name).unlink(missing_ok=True)
    elif is_url(src):
        result = convert_url(src, clean=clean)
    else:
        result = convert_file(Path(src), clean=clean)

    if frontmatter and result.success:
        result = ConversionResult(
            source_path=result.source_path,
            markdown=inject_frontmatter(result.markdown, src),
            success=True,
        )

    if format != "md" and result.success:
        result = ConversionResult(
            source_path=result.source_path,
            markdown=format_output(result.markdown, format, src),
            success=True,
        )

    return result
72
+
73
+
74
def batch(
    source: Union[str, Path],
    output_dir: Optional[Union[str, Path]] = None,
    workers: Optional[int] = None,
    recursive: bool = False,
    include: Optional[list[str]] = None,
    exclude: Optional[list[str]] = None,
    clean: bool = True,
    incremental: bool = False,
) -> "BatchResult":
    """Convert all supported files in a directory using parallel workers.

    Args:
        source: Directory to scan.
        output_dir: Output directory for .md files.
        workers: Number of parallel workers. None = auto.
        recursive: Scan subdirectories.
        include: Glob patterns to include.
        exclude: Glob patterns to exclude.
        clean: Post-process markdown.
        incremental: Skip unchanged files.

    Returns:
        BatchResult with conversion results and statistics.
    """
    from repulp.engine import batch_convert

    # BUG FIX: output_dir was accepted and documented but never forwarded,
    # so converted files could not be directed to the requested directory.
    # Forward it the same way watch() forwards it to watch_directory().
    return batch_convert(
        Path(source),
        output_dir=Path(output_dir) if output_dir else None,
        workers=workers,
        recursive=recursive,
        include=include,
        exclude=exclude,
        clean=clean,
        incremental=incremental,
    )
110
+
111
+
112
def extract_tables(
    source: Union[str, Path],
    format: str = "dict",
    clean: bool = True,
) -> list:
    """Extract tables from a document as structured data.

    Args:
        source: File path or URL.
        format: "dict" (list[dict]), "csv" (strings), "dataframe" (pandas), "markdown" (raw).
        clean: Post-process markdown before extraction.

    Returns:
        List of tables in the requested format.
    """
    from repulp.extractor import extract_tables_structured

    conversion = convert(source, clean=clean)
    if conversion.success:
        return extract_tables_structured(conversion.markdown, format=format)
    return []
134
+
135
+
136
def watch(
    source: Union[str, Path],
    output_dir: Optional[Union[str, Path]] = None,
    recursive: bool = True,
    include: Optional[list[str]] = None,
    exclude: Optional[list[str]] = None,
    clean: bool = True,
    debounce_ms: int = 500,
    on_change=None,
    on_command: Optional[str] = None,
) -> None:
    """Watch a directory and auto-convert files on change.

    Args:
        source: Directory to watch.
        output_dir: Where to write .md files.
        recursive: Watch subdirectories.
        include: Glob patterns to include.
        exclude: Glob patterns to exclude.
        clean: Post-process markdown.
        debounce_ms: Debounce interval in milliseconds.
        on_change: Callback(WatchEvent) after each conversion.
        on_command: Shell command to run after each conversion.
    """
    from repulp.watcher import watch_directory

    # Normalize the optional output directory to a Path before delegating.
    resolved_out = Path(output_dir) if output_dir else None

    watch_directory(
        Path(source),
        output_dir=resolved_out,
        recursive=recursive,
        include=include,
        exclude=exclude,
        clean=clean,
        debounce_ms=debounce_ms,
        on_change=on_change,
        on_command=on_command,
    )
repulp/cache.py ADDED
@@ -0,0 +1,88 @@
1
+ """Incremental build cache for repulp.
2
+
3
+ Tracks SHA256 hashes of source files so that batch_convert() can skip
4
+ files that have not changed since the last successful conversion.
5
+ The cache is persisted as a JSON file (typically `.repulp.cache` inside
6
+ the source directory).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import hashlib
12
+ import json
13
+ from pathlib import Path
14
+ from typing import Optional
15
+
16
+
17
class ConversionCache:
    """File-hash cache that detects which source files have changed.

    Keys are resolved absolute path strings; values are hex-encoded SHA256
    digests of the file contents at the time of last successful conversion.
    """

    def __init__(self, cache_path: Path) -> None:
        """Load the cache from *cache_path*, starting empty on any read error."""
        self._cache_path = Path(cache_path)
        self._hashes: dict[str, str] = {}

        if self._cache_path.exists():
            try:
                self._hashes = json.loads(self._cache_path.read_text(encoding="utf-8"))
            except (json.JSONDecodeError, OSError):
                # A corrupt or unreadable cache just means a full rebuild.
                self._hashes = {}

    @staticmethod
    def _hash_file(path: Path) -> Optional[str]:
        """Return the SHA256 hex digest of *path*'s contents, or None if unreadable."""
        try:
            data = Path(path).resolve().read_bytes()
            return hashlib.sha256(data).hexdigest()
        except OSError:
            # PermissionError is a subclass of OSError, so OSError alone
            # covers missing files, permission problems, and I/O errors.
            return None

    def is_changed(self, path: Path) -> bool:
        """Return True if *path* has changed since the last cached conversion.

        A file is considered changed when:
        - It has never been cached before.
        - Its current SHA256 hash differs from the stored hash.
        - The file no longer exists (hash returns None).
        """
        key = str(Path(path).resolve())
        current_hash = self._hash_file(path)
        if current_hash is None:
            return True
        return self._hashes.get(key) != current_hash

    def mark_converted(self, path: Path) -> None:
        """Record the current SHA256 hash for *path* as the latest converted state."""
        key = str(Path(path).resolve())
        file_hash = self._hash_file(path)
        if file_hash is not None:
            self._hashes[key] = file_hash

    def partition(self, files: list[Path]) -> tuple[list[Path], list[Path]]:
        """Split *files* into (changed, unchanged) based on cached hashes.

        Args:
            files: List of file paths to check.

        Returns:
            A 2-tuple of (changed_files, unchanged_files).
        """
        changed: list[Path] = []
        unchanged: list[Path] = []
        for f in files:
            # is_changed() re-hashes each file; acceptable since partition()
            # is called once per batch, not in a hot loop.
            (changed if self.is_changed(f) else unchanged).append(f)
        return changed, unchanged

    def save(self) -> None:
        """Persist the hash cache to disk as JSON."""
        self._cache_path.parent.mkdir(parents=True, exist_ok=True)
        self._cache_path.write_text(
            json.dumps(self._hashes, indent=2, sort_keys=True) + "\n",
            encoding="utf-8",
        )
repulp/cleaner.py ADDED
@@ -0,0 +1,99 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+
6
+ def _normalize_blank_lines(text: str) -> str:
7
+ return re.sub(r"\n{3,}", "\n\n", text)
8
+
9
+
10
+ def _strip_trailing_whitespace(text: str) -> str:
11
+ return "\n".join(line.rstrip() for line in text.split("\n"))
12
+
13
+
14
+ def _ensure_heading_spacing(text: str) -> str:
15
+ lines = text.split("\n")
16
+ result: list[str] = []
17
+ for i, line in enumerate(lines):
18
+ is_heading = bool(re.match(r"^#{1,6}\s", line))
19
+ if is_heading and i > 0 and result and result[-1] != "":
20
+ result.append("")
21
+ result.append(line)
22
+ if is_heading and i < len(lines) - 1 and lines[i + 1] != "":
23
+ result.append("")
24
+ return "\n".join(result)
25
+
26
+
27
def _fix_table_alignment(text: str) -> str:
    """Find Markdown pipe tables and reformat them with uniform column widths."""
    sep_re = re.compile(r"^\|[\s\-:|]+\|$")
    lines = text.split("\n")
    total = len(lines)
    out: list[str] = []
    pos = 0

    while pos < total:
        # A table starts where a pipe-bearing line is followed by a separator row.
        starts_table = (
            "|" in lines[pos]
            and pos + 1 < total
            and sep_re.match(lines[pos + 1].strip()) is not None
        )
        if starts_table:
            block: list[str] = []
            while pos < total and "|" in lines[pos]:
                block.append(lines[pos])
                pos += 1
            out.extend(_format_table(block))
        else:
            out.append(lines[pos])
            pos += 1

    return "\n".join(out)
46
+
47
+
48
+ def _format_table(table_lines: list[str]) -> list[str]:
49
+ rows: list[list[str]] = []
50
+ separator_idx = 1
51
+
52
+ for line in table_lines:
53
+ cells = [c.strip() for c in line.strip().strip("|").split("|")]
54
+ rows.append(cells)
55
+
56
+ if len(rows) < 2:
57
+ return table_lines
58
+
59
+ num_cols = max(len(row) for row in rows)
60
+ for row in rows:
61
+ while len(row) < num_cols:
62
+ row.append("")
63
+
64
+ col_widths = [
65
+ max(len(rows[r][c]) for r in range(len(rows)) if r != separator_idx)
66
+ for c in range(num_cols)
67
+ ]
68
+ col_widths = [max(w, 3) for w in col_widths]
69
+
70
+ formatted: list[str] = []
71
+ for r, row in enumerate(rows):
72
+ if r == separator_idx:
73
+ cells = ["-" * w for w in col_widths]
74
+ formatted.append("| " + " | ".join(cells) + " |")
75
+ else:
76
+ cells = [row[c].ljust(col_widths[c]) for c in range(num_cols)]
77
+ formatted.append("| " + " | ".join(cells) + " |")
78
+
79
+ return formatted
80
+
81
+
82
+ def _strip_artifacts(text: str) -> str:
83
+ text = text.replace("\x0c", "\n")
84
+ text = text.replace("\x00", "")
85
+ text = re.sub(r"[\x01-\x08\x0b\x0e-\x1f\x7f]", "", text)
86
+ return text
87
+
88
+
89
def clean_markdown(text: str) -> str:
    """Run the full markdown clean-up pipeline over *text*.

    Steps run in order: strip control artifacts, trim trailing whitespace,
    space out headings, collapse blank-line runs, align pipe tables, and
    finally strip leading/trailing whitespace from the whole document.
    """
    if not text:
        return ""

    pipeline = (
        _strip_artifacts,
        _strip_trailing_whitespace,
        _ensure_heading_spacing,
        _normalize_blank_lines,
        _fix_table_alignment,
    )
    for step in pipeline:
        text = step(text)
    return text.strip()