ocrcontext 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,538 @@
1
+ """Google Cloud Vision handwriting engine.
2
+
3
+ Ported verbatim from ocr-service/vision_handwriting.py. Primary engine for
4
+ handwriting mode; TrOCR is the fallback. ``google-cloud-vision`` is imported
5
+ lazily (install the ``vision`` extra).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import os
12
+ import re
13
+ from dataclasses import dataclass
14
+ from typing import Optional
15
+
16
+ from ..exceptions import MissingDependencyError
17
+
18
+ _DIKW_LETTERS = frozenset("WKID")
19
+
20
+
21
+ def _language_hints(ocr_lang: str) -> list[str]:
22
+ """Vision BCP-47 hints. Paddle uses 'latin' for Turkish; UI sends explicit codes."""
23
+ code = (ocr_lang or "").strip().lower()
24
+ if code in ("tr", "tur", "turkish", "latin", "auto", "unknown", ""):
25
+ return ["tr", "en"]
26
+ if code in ("en", "english"):
27
+ return ["en"]
28
+ return ["en", code]
29
+
30
+
31
+ @dataclass
32
+ class _WordBox:
33
+ text: str
34
+ cx: float
35
+ cy: float
36
+ x0: float
37
+ y0: float
38
+ x1: float
39
+ y1: float
40
+
41
+
42
+ _DIKW_MAP = {
43
+ "wisdom": ("W", "Wisdom"),
44
+ "knowledge": ("K", "Knowledge"),
45
+ "information": ("I", "Information"),
46
+ "data": ("D", "Data"),
47
+ }
48
+
49
+
50
+ def _vertices_box(vertices) -> tuple[float, float, float, float]:
51
+ xs = [float(v.x) for v in vertices]
52
+ ys = [float(v.y) for v in vertices]
53
+ return min(xs), min(ys), max(xs), max(ys)
54
+
55
+
56
+ def _word_text(word) -> str:
57
+ return "".join(s.text for s in word.symbols).strip()
58
+
59
+
60
+ def _collect_words(full_annotation) -> list[_WordBox]:
61
+ words: list[_WordBox] = []
62
+ if not full_annotation or not full_annotation.pages:
63
+ return words
64
+ for page in full_annotation.pages:
65
+ for block in page.blocks:
66
+ for paragraph in block.paragraphs:
67
+ for word in paragraph.words:
68
+ text = _word_text(word)
69
+ if not text:
70
+ continue
71
+ x0, y0, x1, y1 = _vertices_box(word.bounding_box.vertices)
72
+ words.append(
73
+ _WordBox(
74
+ text=text,
75
+ cx=(x0 + x1) / 2,
76
+ cy=(y0 + y1) / 2,
77
+ x0=x0,
78
+ y0=y0,
79
+ x1=x1,
80
+ y1=y1,
81
+ )
82
+ )
83
+ return words
84
+
85
+
86
+ def _row_tolerance(words: list[_WordBox]) -> float:
87
+ if not words:
88
+ return 20.0
89
+ heights = [w.y1 - w.y0 for w in words if w.y1 > w.y0]
90
+ if not heights:
91
+ return 20.0
92
+ heights.sort()
93
+ med = heights[len(heights) // 2]
94
+ return max(12.0, min(35.0, med * 0.75))
95
+
96
+
97
+ def _cluster_rows(words: list[_WordBox], tol: float) -> list[list[_WordBox]]:
98
+ if not words:
99
+ return []
100
+ sorted_words = sorted(words, key=lambda w: w.cy)
101
+ rows: list[list[_WordBox]] = []
102
+ for w in sorted_words:
103
+ placed = False
104
+ for row in rows:
105
+ row_cy = sum(x.cy for x in row) / len(row)
106
+ if abs(w.cy - row_cy) <= tol:
107
+ row.append(w)
108
+ placed = True
109
+ break
110
+ if not placed:
111
+ rows.append([w])
112
+ for row in rows:
113
+ row.sort(key=lambda w: w.x0)
114
+ rows.sort(key=lambda r: sum(w.cy for w in r) / len(r))
115
+ return rows
116
+
117
+
118
+ def _is_dikw_letter_token(token: str) -> bool:
119
+ """Single-letter DIKW side labels only (W, K, I, D) - not 'ne', 'to', etc."""
120
+ t = re.sub(r"[^a-zA-Z]", "", token.strip())
121
+ return len(t) == 1 and t.upper() in _DIKW_LETTERS
122
+
123
+
124
def _is_short_label(token: str) -> bool:
    """Alias of :func:`_is_dikw_letter_token`; a short label is a DIKW letter."""
    is_letter = _is_dikw_letter_token(token)
    return is_letter
126
+
127
+
128
def _normalize_dikw_word(token: str) -> Optional[tuple[str, str]]:
    """Look up a DIKW level name, ignoring case and non-letter characters.

    Returns the ``(letter, label)`` pair from ``_DIKW_MAP``, or ``None`` when
    the cleaned token is not one of its keys.
    """
    lookup_key = re.sub(r"[^a-z]", "", token.lower())
    return _DIKW_MAP.get(lookup_key)
131
+
132
+
133
+ def _row_has_dikw_pattern(tokens: list[str]) -> bool:
134
+ letters = [t for t in tokens if _is_dikw_letter_token(t)]
135
+ longs = [
136
+ t for t in tokens if not _is_dikw_letter_token(t) and len(re.sub(r"\W", "", t)) > 1
137
+ ]
138
+ return bool(letters) and bool(longs) and len(tokens) <= 8
139
+
140
+
141
+ def _format_row_tokens(tokens: list[str]) -> str:
142
+ """Default: space-joined prose. DIKW letter+word merge only when pattern matches."""
143
+ if not tokens:
144
+ return ""
145
+ if len(tokens) == 1:
146
+ return tokens[0]
147
+ if not _row_has_dikw_pattern(tokens):
148
+ return " ".join(tokens)
149
+
150
+ shorts = [t for t in tokens if _is_dikw_letter_token(t)]
151
+ longs = [t for t in tokens if not _is_dikw_letter_token(t)]
152
+
153
+ pairs: list[str] = []
154
+ used_long: set[int] = set()
155
+
156
+ for s in shorts:
157
+ letter = re.sub(r"[^a-zA-Z]", "", s).upper()
158
+ matched = False
159
+ for i, lng in enumerate(longs):
160
+ if i in used_long:
161
+ continue
162
+ mapped = _normalize_dikw_word(lng)
163
+ if mapped and mapped[0] == letter:
164
+ pairs.append(f"{mapped[1]} ({mapped[0]})")
165
+ used_long.add(i)
166
+ matched = True
167
+ break
168
+ if not matched and len(longs) == 1 and 0 not in used_long:
169
+ pairs.append(f"{longs[0]} ({letter})")
170
+ used_long.add(0)
171
+ elif not matched:
172
+ pairs.append(s)
173
+
174
+ for i, lng in enumerate(longs):
175
+ if i not in used_long:
176
+ pairs.append(lng)
177
+
178
+ return " · ".join(pairs) if len(pairs) > 1 else (pairs[0] if pairs else " ".join(tokens))
179
+
180
+
181
+ def _is_margin_number_line(line: str) -> bool:
182
+ s = line.strip()
183
+ if not s:
184
+ return True
185
+ if re.fullmatch(r"[\d\s]+", s):
186
+ return True
187
+ if re.fullmatch(r"\d{2,4}", s):
188
+ return True
189
+ return False
190
+
191
+
192
+ def _is_pyramid_header(line: str) -> bool:
193
+ low = line.lower()
194
+ return "piramid" in low or "pyramid" in low or "dikw" in low
195
+
196
+
197
+ def _row_looks_like_dikw_pair(line: str) -> bool:
198
+ tokens = re.split(r"\s+|[·]", line.replace("·", " "))
199
+ tokens = [t for t in tokens if t]
200
+ if len(tokens) < 2:
201
+ return False
202
+ has_letter = any(_is_dikw_letter_token(t) for t in tokens)
203
+ has_long = any(len(re.sub(r"\W", "", t)) > 2 for t in tokens)
204
+ return has_letter and has_long
205
+
206
+
207
def document_has_dikw_structure(lines: list[str]) -> bool:
    """Return True when the lines look like a DIKW / pyramid diagram.

    Scanning stops at the first pyramid header, or as soon as two lines have
    matched the letter+word pair pattern. Plain prose yields False.
    """
    pairs_seen = 0
    for row in lines:
        if _is_pyramid_header(row):
            return True
        if _row_looks_like_dikw_pair(row):
            pairs_seen += 1
            if pairs_seen >= 2:
                return True
    return False
218
+
219
+
220
def detect_dikw_structure(text: str) -> bool:
    """Split ``text`` into trimmed non-blank lines and test for DIKW layout."""
    trimmed = (raw.strip() for raw in text.splitlines())
    return document_has_dikw_structure([row for row in trimmed if row])
223
+
224
+
225
+ def _dedupe_consecutive_tokens(line: str) -> str:
226
+ tokens = line.split()
227
+ if len(tokens) < 2:
228
+ return line
229
+
230
+ def norm(t: str) -> str:
231
+ return re.sub(r"[^\w]", "", t.lower())
232
+
233
+ out = [tokens[0]]
234
+ for t in tokens[1:]:
235
+ if norm(t) != norm(out[-1]):
236
+ out.append(t)
237
+ return " ".join(out)
238
+
239
+
240
def dedupe_prose_lines(lines: list[str]) -> list[str]:
    """Apply consecutive-token de-duplication to every line, in order."""
    cleaned: list[str] = []
    for row in lines:
        cleaned.append(_dedupe_consecutive_tokens(row))
    return cleaned
242
+
243
+
244
+ def _line_ends_complete(line: str) -> bool:
245
+ s = line.rstrip()
246
+ if not s:
247
+ return True
248
+ if s.endswith(("...", "…")):
249
+ return True
250
+ return s[-1] in ".!?;:"
251
+
252
+
253
+ def _line_starts_continuation(line: str) -> bool:
254
+ s = line.lstrip()
255
+ if not s:
256
+ return False
257
+ return s[0].islower()
258
+
259
+
260
+ def _looks_like_signature_line(line: str) -> bool:
261
+ words = [w for w in line.split() if w]
262
+ if len(words) < 2 or len(words) > 5:
263
+ return False
264
+ caps = sum(1 for w in words if w[0].isupper())
265
+ return caps >= 2 and not _line_starts_continuation(line)
266
+
267
+
268
+ def _capitalize_line_start(line: str) -> str:
269
+ for i, ch in enumerate(line):
270
+ if ch.isalpha():
271
+ return line[:i] + ch.upper() + line[i + 1:]
272
+ return line
273
+
274
+
275
def merge_wrapped_prose_lines(lines: list[str]) -> list[str]:
    """Join Vision line breaks that split one sentence/verse across rows.

    Blank lines are dropped and the rest trimmed. A line is appended to the
    previous output line only when that previous line does not end with
    closing punctuation and is very short: at most two words, or at most
    three words when the new line starts with a lower-case letter. Every
    returned line has its first letter capitalised.
    """
    rows = [s for s in (raw.strip() for raw in lines) if s]
    if len(rows) < 2:
        return [_capitalize_line_start(row) for row in rows]

    joined: list[str] = [rows[0]]
    for following in rows[1:]:
        current = joined[-1]
        current_done = _line_ends_complete(current)
        current_len = len(current.split())

        if _looks_like_signature_line(following) and current_done:
            # NOTE(review): this guard never changes the outcome, because
            # both merge rules below already require an unfinished line.
            glue = False
        elif (
            _line_starts_continuation(following)
            and not current_done
            and current_len <= 3
        ):
            glue = True
        elif current_len <= 2 and not current_done:
            glue = True
        else:
            glue = False

        if glue:
            joined[-1] = f"{current} {following}"
        else:
            joined.append(following)

    return [_capitalize_line_start(row) for row in joined]
303
+
304
+
305
+ def _format_dikw_hierarchy(header: str, pair_lines: list[str], side_notes: list[str]) -> str:
306
+ entries: list[tuple[int, str]] = []
307
+ order_key = {"W": 0, "K": 1, "I": 2, "D": 3}
308
+
309
+ for line in pair_lines:
310
+ tokens = [t for t in re.split(r"\s+|[·]", line.strip()) if t]
311
+ shorts = [t.upper() for t in tokens if _is_dikw_letter_token(t)]
312
+ longs = [t for t in tokens if not _is_dikw_letter_token(t)]
313
+ letter = shorts[0] if shorts else ""
314
+ label = longs[0] if longs else (shorts[0] if shorts else line)
315
+ mapped = _normalize_dikw_word(label)
316
+ if mapped:
317
+ letter, label = mapped
318
+ rank = order_key.get(letter, 99)
319
+ if letter == "W":
320
+ entries.append((rank, f"- {label} ({letter}) — en üst"))
321
+ elif letter == "D":
322
+ entries.append((rank, f"- {label} ({letter}) — taban"))
323
+ elif letter:
324
+ entries.append((rank, f"- {label} ({letter})"))
325
+ else:
326
+ entries.append((rank, f"- {line.strip()}"))
327
+
328
+ entries.sort(key=lambda x: x[0])
329
+ out = [header.rstrip(":") + ":"]
330
+ out.extend(e[1] for e in entries)
331
+ if side_notes:
332
+ note_parts = [n.strip() for n in side_notes if n.strip()]
333
+ if note_parts:
334
+ out.append("Not: " + " · ".join(note_parts))
335
+ return "\n".join(out)
336
+
337
+
338
+ def _restructure_document_lines(lines: list[str]) -> list[str]:
339
+ """Detect Bilgi Piramidi / DIKW blocks and emit hierarchical list."""
340
+ result: list[str] = []
341
+ i = 0
342
+ while i < len(lines):
343
+ line = lines[i]
344
+ if not _is_pyramid_header(line):
345
+ result.append(line)
346
+ i += 1
347
+ continue
348
+
349
+ header = line
350
+ i += 1
351
+ pair_lines: list[str] = []
352
+ side_notes: list[str] = []
353
+
354
+ while i < len(lines):
355
+ nxt = lines[i].strip()
356
+ if not nxt:
357
+ i += 1
358
+ break
359
+ low = nxt.lower()
360
+ if _is_pyramid_header(nxt) and pair_lines:
361
+ break
362
+ if len(nxt) > 80 and not _row_looks_like_dikw_pair(nxt):
363
+ break
364
+ if low in ("value", "meaning") or low.startswith("value") or low.startswith("meaning"):
365
+ if "value" in low:
366
+ side_notes.append("Value (↑) değer artar")
367
+ if "meaning" in low:
368
+ side_notes.append("Meaning (↓) anlam artar")
369
+ i += 1
370
+ continue
371
+ if (
372
+ _row_looks_like_dikw_pair(nxt)
373
+ or _is_dikw_letter_token(nxt)
374
+ or _normalize_dikw_word(nxt)
375
+ ):
376
+ pair_lines.append(nxt)
377
+ i += 1
378
+ continue
379
+ if len(pair_lines) >= 2:
380
+ break
381
+ pair_lines.append(nxt)
382
+ i += 1
383
+
384
+ if len(pair_lines) >= 2:
385
+ result.append(_format_dikw_hierarchy(header, pair_lines, side_notes))
386
+ else:
387
+ result.append(header)
388
+ result.extend(pair_lines)
389
+ result.extend(side_notes)
390
+
391
+ return result
392
+
393
+
394
def spatial_text_from_annotation(full_annotation) -> tuple[str, bool]:
    """Build reading-order text using word bounding boxes.

    Words are clustered into rows, each row is formatted, and rows that are
    blank or digits-only are dropped.

    Returns:
        ``(text, has_dikw)``. ``text`` is ``""`` when nothing usable was
        found; ``has_dikw`` tells whether the rows looked like a DIKW diagram.
    """
    words = _collect_words(full_annotation)
    if not words:
        return "", False

    rows = _cluster_rows(words, _row_tolerance(words))
    lines: list[str] = []
    for row in rows:
        line = _format_row_tokens([w.text for w in row])
        if line and not _is_margin_number_line(line):
            lines.append(line)

    if not lines:
        return "", False

    has_dikw = document_has_dikw_structure(lines)

    # De-duplicate BEFORE restructuring: the restructured hierarchy is a
    # single multi-line string, and token de-duplication re-joins on spaces,
    # which would flatten its line breaks.
    lines = dedupe_prose_lines(lines)
    if has_dikw:
        lines = _restructure_document_lines(lines)
    else:
        lines = merge_wrapped_prose_lines(lines)
    return "\n".join(lines).strip(), has_dikw
419
+
420
+
421
class GoogleVisionHandwritingEngine:
    """Thin wrapper around the Vision API client. Call :meth:`load` once.

    Until :meth:`load` has built a client the engine is disabled and
    :meth:`extract_text_from_bytes` returns an empty string.
    """

    def __init__(self) -> None:
        self._client = None
        self._enabled = False
        # Flags describing the most recent extract_text_from_bytes call.
        self._last_spatial = False
        self._last_has_dikw_structure = False

    @property
    def enabled(self) -> bool:
        """Whether :meth:`load` successfully created a client."""
        return self._enabled

    @property
    def last_used_spatial(self) -> bool:
        """Whether the last extraction used bounding-box reconstruction."""
        return self._last_spatial

    @property
    def last_has_dikw_structure(self) -> bool:
        """Whether the last extracted text looked like a DIKW diagram."""
        return self._last_has_dikw_structure

    def load(self) -> None:
        """Load Vision client from env-based credentials.

        Supported env keys (first non-empty one wins):
        - GOOGLE_VISION_SERVICE_ACCOUNT_JSON
        - GOOGLE_APPLICATION_CREDENTIALS_JSON

        The engine is left disabled when neither key is set or the value is
        not valid JSON.

        Raises:
            MissingDependencyError: if the Google client packages cannot be
                imported.
        """
        try:
            from google.cloud import vision
            from google.oauth2 import service_account
        except ImportError as exc:  # pragma: no cover - exercised via install matrix
            raise MissingDependencyError("google-cloud-vision", "vision") from exc

        raw = (
            os.environ.get("GOOGLE_VISION_SERVICE_ACCOUNT_JSON")
            or os.environ.get("GOOGLE_APPLICATION_CREDENTIALS_JSON")
            or ""
        ).strip()

        if not raw:
            self._enabled = False
            self._client = None
            return

        try:
            info = json.loads(raw)
        except json.JSONDecodeError:
            self._enabled = False
            self._client = None
            return

        creds = service_account.Credentials.from_service_account_info(info)
        self._client = vision.ImageAnnotatorClient(credentials=creds)
        self._enabled = True

    def extract_text_from_bytes(self, image_bytes: bytes, ocr_lang: str = "en") -> str:
        """Run document text detection on an image and return cleaned text.

        Args:
            image_bytes: Raw image content sent to the Vision API.
            ocr_lang: Language code turned into Vision language hints.

        Returns:
            The extracted text, or ``""`` when the engine is disabled or
            nothing was recognised.

        Raises:
            RuntimeError: if the Vision response carries an error message.
        """
        self._last_spatial = False
        self._last_has_dikw_structure = False
        if not self._enabled or self._client is None:
            return ""

        # Imported only after the guard so that a disabled engine returns ""
        # even when the optional Vision dependency is not installed.
        from google.cloud import vision

        image = vision.Image(content=image_bytes)
        hints = _language_hints(ocr_lang)
        context = vision.ImageContext(language_hints=hints)
        response = self._client.document_text_detection(image=image, image_context=context)

        if response.error and response.error.message:
            raise RuntimeError(f"Vision API error: {response.error.message}")

        annotation = response.full_text_annotation
        if annotation and annotation.pages:
            flat = (annotation.text or "").strip()
            has_dikw = detect_dikw_structure(flat) if flat else False
            self._last_has_dikw_structure = has_dikw

            # Always prefer Vision's flat reading-order text: it is the most faithful
            # transcription. Layout (incl. DIKW diagrams) is reconstructed downstream by
            # the LLM, which handles tightly-stacked notes far better than bbox clustering.
            if flat:
                lines = [ln.strip() for ln in flat.splitlines() if ln.strip()]
                lines = [ln for ln in lines if not _is_margin_number_line(ln)]
                lines = dedupe_prose_lines(lines)
                if not has_dikw:
                    lines = merge_wrapped_prose_lines(lines)
                return "\n".join(lines)

            # No flat text: rebuild reading order from word boxes and accept
            # the result only when it is at least 10 characters long.
            spatial, has_dikw_spatial = spatial_text_from_annotation(annotation)
            if spatial and len(spatial) >= 10:
                self._last_spatial = True
                self._last_has_dikw_structure = has_dikw_spatial
                return spatial

        # Last resort: the description of the first text annotation.
        if response.text_annotations:
            flat = (response.text_annotations[0].description or "").strip()
            self._last_has_dikw_structure = detect_dikw_structure(flat)
            return flat

        return ""
523
+
524
+
525
def run_vision_on_page(
    engine: GoogleVisionHandwritingEngine,
    img_path: str,
    ocr_lang: str = "en",
) -> tuple[str, float]:
    """OCR one page image with Google Vision.

    Returns ``(text, pseudo_confidence)``. The confidence is derived purely
    from text length: ``len(text) / 250`` capped at 1.0, and 0.0 for empty
    text. A disabled engine yields ``("", 0.0)`` without opening the file.
    """
    if not engine.enabled:
        return "", 0.0

    with open(img_path, "rb") as handle:
        payload = handle.read()
        text = engine.extract_text_from_bytes(payload, ocr_lang=ocr_lang)

    if not text:
        return text, 0.0
    return text, min(1.0, len(text) / 250.0)
@@ -0,0 +1,45 @@
1
+ """Exception hierarchy for ocrcontext."""
2
+
3
+ from __future__ import annotations
4
+
5
+
6
class OcrContextError(Exception):
    """Root of the ocrcontext exception hierarchy.

    Catch this type to handle any error raised by the package.
    """
8
+
9
+
10
class MissingDependencyError(OcrContextError):
    """A required optional dependency (extra) is not installed.

    Raised lazily when an engine is first used so the base install stays
    light. The offending package and the extra that provides it are kept on
    the ``package`` and ``extra`` attributes.
    """

    def __init__(self, package: str, extra: str) -> None:
        message = (
            f"'{package}' is required for this feature but is not installed. "
            f"Install it with: pip install 'ocrcontext[{extra}]'"
        )
        super().__init__(message)
        self.package = package
        self.extra = extra
23
+
24
+
25
class UnsupportedFileError(OcrContextError):
    """Raised when a file type or source cannot be interpreted."""
27
+
28
+
29
class NoTextDetectedError(OcrContextError):
    """Raised when OCR yields no usable text for the document."""
31
+
32
+
33
class LLMNotConfiguredError(OcrContextError):
    """An LLM-dependent operation was requested without injecting a chat model.

    The message names the operation and shows how to pass a model to
    ``Analyzer``.
    """

    def __init__(self, operation: str = "this operation") -> None:
        message = (
            f"{operation} requires a LangChain chat model. Pass one to Analyzer(llm=...), e.g.\n"
            " from langchain_openai import ChatOpenAI\n"
            " analyzer = Analyzer(llm=ChatOpenAI(model='gpt-4o'))"
        )
        super().__init__(message)
42
+
43
+
44
class EngineError(OcrContextError):
    """Raised when an OCR engine fails to initialize or run."""
@@ -0,0 +1,10 @@
1
+ """LLM layer: refinement, structured extraction, and fidelity guards.
2
+
3
+ Only ``langchain-core`` is required here. Bring your own provider package
4
+ (``langchain-openai``, ``langchain-anthropic``, ``langchain-ollama``, ...).
5
+ """
6
+
7
+ from .extractor import StructuredExtractor
8
+ from .refiner import Refiner
9
+
10
+ __all__ = ["Refiner", "StructuredExtractor"]
@@ -0,0 +1,58 @@
1
+ """Hallucination / drift guards, ported from lib/ocr/refine-drift.ts.
2
+
3
+ If LLM refinement diverges too far from the source OCR text, the raw text is
4
+ kept instead — fidelity over fluency.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import re
10
+
11
+ _NON_ALNUM = re.compile(r"[^\w]", re.UNICODE)
12
+
13
+
14
+ def _normalize_token(t: str) -> str:
15
+ return _NON_ALNUM.sub("", t.lower())
16
+
17
+
18
def refine_hallucinated_length(original: str, refined: str) -> bool:
    """Light guard for generous prose mode: flag wholesale length divergence.

    Compares whitespace-separated word counts. Returns True when the refined
    text has fewer than half, or more than 1.8 times, the words of the
    original. An original without words never triggers the guard.
    """
    source_count = len(original.split())
    if source_count == 0:
        return False
    ratio = len(refined.split()) / source_count
    return not (0.5 <= ratio <= 1.8)
26
+
27
+
28
def refinement_drifted(original: str, refined: str) -> bool:
    """Reject LLM refine output that diverges too far from source.

    Three checks run in order; the first that trips returns True:

    1. The non-blank line count changed by more than 35%.
    2. The word count changed by more than 25%.
    3. With at least two comparable lines, more than 40% of them changed. A
       line counts as changed when its normalised token count differs, or
       when more than ``max(1, 34% of tokens)`` positions hold another token.
    """
    src_lines = [ln for ln in original.split("\n") if ln.strip()]
    out_lines = [ln for ln in refined.split("\n") if ln.strip()]
    if src_lines and abs(len(out_lines) - len(src_lines)) / len(src_lines) > 0.35:
        return True

    src_word_count = len(original.split())
    out_word_count = len(refined.split())
    if src_word_count and abs(out_word_count - src_word_count) / src_word_count > 0.25:
        return True

    # Line-by-line: too many wholly different words (e.g. var -> vakit, Elinde -> içinde)
    comparable = min(len(src_lines), len(out_lines))
    if comparable < 2:
        return False

    changed = 0
    # zip stops at the shorter list, i.e. after `comparable` pairs.
    for src, out in zip(src_lines, out_lines):
        src_tokens = [tok for tok in map(_normalize_token, src.split()) if tok]
        out_tokens = [tok for tok in map(_normalize_token, out.split()) if tok]
        if len(src_tokens) != len(out_tokens):
            changed += 1
            continue
        mismatches = sum(1 for a, b in zip(src_tokens, out_tokens) if a != b)
        if mismatches > max(1, int(len(src_tokens) * 0.34)):
            changed += 1

    return changed / comparable > 0.4