@kulapard/pi-caveman 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,213 +0,0 @@
1
- #!/usr/bin/env python3
2
- import re
3
- from collections import Counter
4
- from pathlib import Path
5
-
6
# Bare http(s) URLs. The match stops at whitespace or ")" so that the target of
# a markdown link "[text](https://...)" does not swallow the closing paren.
URL_REGEX = re.compile(r"https?://[^\s)]+")

# A candidate fence line: up to three leading whitespace characters (group 1),
# a run of 3+ backticks or 3+ tildes (group 2), and the rest of the line
# (group 3, the info string on an opening fence).
FENCE_OPEN_REGEX = re.compile(r"^(\s{0,3})(`{3,}|~{3,})(.*)$")

# ATX heading: 1-6 "#" characters (group 1), horizontal whitespace, then the
# title (group 2). The separator is deliberately "[ \t]+" rather than "\s+":
# "\s" also matches "\n", which let a bare "#" line consume the line break and
# report the *next* line as its title.
HEADING_REGEX = re.compile(r"^(#{1,6})[ \t]+(.*)", re.MULTILINE)

# Start of a list item using "-", "*" or "+" as the marker.
BULLET_REGEX = re.compile(r"^\s*[-*+]\s+", re.MULTILINE)

# crude but effective path detection
# Requires either a path prefix (./ ../ / or drive letter) or a slash/backslash within the match
PATH_REGEX = re.compile(r"(?:\./|\.\./|/|[A-Za-z]:\\)[\w\-/\\\.]+|[\w\-\.]+[/\\][\w\-/\\\.]+")
14
-
15
-
16
class ValidationResult:
    """Accumulates the outcome of a validation run.

    A fresh result is valid and holds no messages. Recording an error marks
    the result invalid; recording a warning never changes validity.
    """

    def __init__(self) -> None:
        self.errors, self.warnings = [], []
        # Stays True until the first add_error() call.
        self.is_valid = True

    def add_warning(self, msg) -> None:
        """Record *msg* as a warning without affecting ``is_valid``."""
        self.warnings.append(msg)

    def add_error(self, msg) -> None:
        """Record *msg* as an error and mark the result invalid."""
        self.errors.append(msg)
        self.is_valid = False
28
-
29
-
30
def read_file(path: Path) -> str:
    """Return the text content of *path*, decoded as UTF-8.

    The encoding is pinned to UTF-8 instead of the platform's locale default,
    so the same file yields the same text on every system. Bytes that are not
    valid UTF-8 are dropped (``errors="ignore"``) rather than raising.
    """
    return path.read_text(encoding="utf-8", errors="ignore")
32
-
33
-
34
- # ---------- Extractors ----------
35
-
36
-
37
def extract_headings(text):
    """Return every heading matched by ``HEADING_REGEX`` in *text*.

    Each entry is a ``(marker, title)`` tuple, in document order, where
    ``marker`` is the run of ``#`` characters and ``title`` has surrounding
    whitespace stripped.
    """
    headings = []
    for match in HEADING_REGEX.finditer(text):
        marker, title = match.groups()
        headings.append((marker, title.strip()))
    return headings
39
-
40
-
41
def extract_code_blocks(text):
    """Collect the fenced code blocks of *text*, scanning line by line.

    A block opens on any line matching ``FENCE_OPEN_REGEX`` and closes on the
    first later line whose fence uses the same character, is at least as long
    as the opening fence, and carries nothing but whitespace after it. Because
    a shorter fence cannot close a longer one, an outer 4-backtick block may
    wrap inner 3-backtick content.

    Returns the blocks in document order, each as the opening line, the body
    and the closing line joined with ``"\\n"``.
    """
    rows = text.split("\n")
    total = len(rows)
    found = []
    cursor = 0
    while cursor < total:
        opener = FENCE_OPEN_REGEX.match(rows[cursor])
        if opener is None:
            cursor += 1
            continue

        marker = opener.group(2)
        first = cursor

        # Look for the matching closing fence below the opening line.
        last = None
        for idx in range(first + 1, total):
            candidate = FENCE_OPEN_REGEX.match(rows[idx])
            if candidate is None:
                continue
            run = candidate.group(2)
            if (
                run[0] == marker[0]
                and len(run) >= len(marker)
                and not candidate.group(3).strip()
            ):
                last = idx
                break

        if last is None:
            # Unclosed fences are silently skipped — they indicate malformed
            # markdown and including them would cause false-positive
            # validation failures. The search above already ran to the end of
            # the text, so there is nothing left to scan.
            break

        found.append("\n".join(rows[first:last + 1]))
        cursor = last + 1
    return found
83
-
84
-
85
def extract_urls(text):
    """Return the set of distinct URLs matched by ``URL_REGEX`` in *text*.

    Punctuation that trails a match (``?!.,:*_~``) is stripped, following the
    trailing-punctuation rule of GitHub Flavored Markdown autolinks. Without
    this, a URL that ends a sentence ("see https://a.b/c.") is captured with
    the full stop attached and compares unequal to the same URL used
    mid-sentence elsewhere.
    """
    return {url.rstrip("?!.,:*_~") for url in URL_REGEX.findall(text)}
87
-
88
-
89
def extract_paths(text):
    """Return the set of distinct substrings of *text* matched by ``PATH_REGEX``."""
    # PATH_REGEX has no capturing groups, so each match's full text is the path.
    return {match.group(0) for match in PATH_REGEX.finditer(text)}
91
-
92
-
93
def count_bullets(text):
    """Return how many times ``BULLET_REGEX`` matches in *text*."""
    return sum(1 for _ in BULLET_REGEX.finditer(text))
95
-
96
-
97
def _strip_fenced_blocks(text):
    """Return *text* with every closed fenced code block removed.

    Fences are recognised with ``FENCE_OPEN_REGEX`` and paired by the same
    rule ``extract_code_blocks`` applies: the closing fence uses the same
    character, is at least as long as the opening one, and has only whitespace
    after it. A fence that is never closed is left in place.
    """
    kept = []
    pending = []  # lines of the fence currently open, in case it never closes
    open_marker = None
    for line in text.split("\n"):
        fence = FENCE_OPEN_REGEX.match(line)
        if open_marker is None:
            if fence:
                open_marker = fence.group(2)
                pending = [line]
            else:
                kept.append(line)
            continue

        pending.append(line)
        if (
            fence
            and fence.group(2)[0] == open_marker[0]
            and len(fence.group(2)) >= len(open_marker)
            and not fence.group(3).strip()
        ):
            # Block closed: discard everything buffered for it.
            open_marker = None
            pending = []

    if open_marker is not None:
        # Unclosed fence: keep its lines rather than dropping the rest of the text.
        kept.extend(pending)
    return "\n".join(kept)


def extract_inline_codes(text):
    """Return the contents of single-backtick code spans outside fenced blocks.

    Fenced blocks are removed with a line scan instead of a lazy regex. The
    regex paired the first three characters of a longer fence with the next
    line starting with three fence characters, so a 4-backtick block wrapping
    3-backtick content (or any indented fence) left stray backticks behind and
    produced bogus spans.

    Spans are returned in document order, duplicates included.
    """
    return re.findall(r"`([^`]+)`", _strip_fenced_blocks(text))
101
-
102
-
103
- # ---------- Validators ----------
104
-
105
-
106
def validate_headings(orig, comp, result):
    """Compare the headings of the original and compressed texts.

    Adds an error to *result* when the number of headings differs, and a
    warning when the heading lists are not equal (different level, text or
    order). Both may be reported for the same pair of texts.
    """
    before, after = extract_headings(orig), extract_headings(comp)
    n_before, n_after = len(before), len(after)

    if n_before != n_after:
        result.add_error(f"Heading count mismatch: {n_before} vs {n_after}")

    if before != after:
        result.add_warning("Heading text/order changed")
115
-
116
-
117
def validate_code_blocks(orig, comp, result):
    """Add an error to *result* unless both texts yield identical fenced code blocks.

    The lists returned by ``extract_code_blocks`` must be equal, so content,
    count and order all have to match.
    """
    if extract_code_blocks(orig) == extract_code_blocks(comp):
        return
    result.add_error("Code blocks not preserved exactly")
123
-
124
-
125
def validate_urls(orig, comp, result):
    """Add an error to *result* when the two texts do not contain the same URL set.

    The message lists the URLs found only in the original (``lost``) and those
    found only in the compressed text (``added``).
    """
    before, after = extract_urls(orig), extract_urls(comp)
    if before == after:
        return

    lost = before - after
    added = after - before
    result.add_error(f"URL mismatch: lost={lost}, added={added}")
131
-
132
-
133
def validate_paths(orig, comp, result):
    """Add a warning to *result* when the two texts do not contain the same path set.

    The message lists the paths found only in the original (``lost``) and
    those found only in the compressed text (``added``).
    """
    before, after = extract_paths(orig), extract_paths(comp)
    if before == after:
        return

    lost = before - after
    added = after - before
    result.add_warning(f"Path mismatch: lost={lost}, added={added}")
139
-
140
-
141
def validate_bullets(orig, comp, result):
    """Warn when the bullet count changed by more than 15% of the original count.

    Nothing is checked when the original text has no bullets, which also
    avoids dividing by zero.
    """
    original_count = count_bullets(orig)
    compressed_count = count_bullets(comp)

    if not original_count:
        return

    relative_change = abs(original_count - compressed_count) / original_count
    if relative_change > 0.15:
        result.add_warning(
            f"Bullet count changed too much: {original_count} -> {compressed_count}"
        )
152
-
153
-
154
def validate_inline_codes(orig, comp, result):
    """Compare inline code spans of both texts, taking multiplicity into account.

    A span that is missing from the compressed text, or occurs there fewer
    times than in the original, is reported as an error. A span that appears
    only in the compressed text is reported as a warning. Spans that merely
    occur more often in the compressed text are not reported.
    """
    before = Counter(extract_inline_codes(orig))
    after = Counter(extract_inline_codes(comp))
    if before == after:
        return

    lost = before.keys() - after.keys()
    added = after.keys() - before.keys()

    # Spans present on both sides but with fewer occurrences after compression.
    lost.update(
        f"{code} (lost {count - after[code]} of {count} occurrences)"
        for code, count in before.items()
        if code in after and after[code] < count
    )

    if lost:
        result.add_error(f"Inline code lost: {lost}")
    if added:
        result.add_warning(f"Inline code added: {added}")
168
-
169
-
170
- # ---------- Main ----------
171
-
172
-
173
def validate(original_path: Path, compressed_path: Path) -> ValidationResult:
    """Run every validator on the two files and return the combined result.

    Both files are read with ``read_file``. The validators then run in a fixed
    order (headings, code blocks, URLs, paths, bullets, inline code), each
    appending its findings to one shared ``ValidationResult``.
    """
    original_text = read_file(original_path)
    compressed_text = read_file(compressed_path)

    report = ValidationResult()
    checks = (
        validate_headings,
        validate_code_blocks,
        validate_urls,
        validate_paths,
        validate_bullets,
        validate_inline_codes,
    )
    for check in checks:
        check(original_text, compressed_text, report)
    return report
187
-
188
-
189
- # ---------- CLI ----------
190
-
191
if __name__ == "__main__":
    import sys

    # Exactly two positional arguments are expected after the script name.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python validate.py <original> <compressed>")
        sys.exit(1)

    original_file, compressed_file = (Path(arg).resolve() for arg in cli_args)
    report = validate(original_file, compressed_file)

    print(f"\nValid: {report.is_valid}")

    # Print each non-empty message list under its own heading, errors first.
    for title, messages in (("Errors", report.errors), ("Warnings", report.warnings)):
        if messages:
            print(f"\n{title}:")
            for message in messages:
                print(f" - {message}")