academic-refchecker 2.0.12__py3-none-any.whl → 2.0.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/METADATA +1 -1
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/RECORD +17 -17
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/WHEEL +1 -1
- backend/main.py +33 -5
- backend/refchecker_wrapper.py +42 -1
- backend/thumbnail.py +117 -0
- refchecker/__version__.py +1 -1
- refchecker/checkers/arxiv_citation.py +181 -49
- refchecker/checkers/enhanced_hybrid_checker.py +117 -4
- refchecker/checkers/semantic_scholar.py +43 -1
- refchecker/llm/base.py +1 -15
- refchecker/llm/providers.py +102 -113
- refchecker/utils/author_utils.py +15 -2
- refchecker/utils/bibliography_utils.py +2 -2
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/top_level.txt +0 -0
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
academic_refchecker-2.0.
|
|
1
|
+
academic_refchecker-2.0.14.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
|
|
2
2
|
backend/__init__.py,sha256=TFVkOx5tSp3abty15RzUbaSwQ9ZD0kfUn7PDh63xkYY,521
|
|
3
3
|
backend/__main__.py,sha256=74V7yUMsRSZaaRyXYm-rZVc3TVUcUgwsoTQTUbV5EqM,211
|
|
4
4
|
backend/cli.py,sha256=xV3l9M5OdNQQYOcrzj2d_7RmCgj7CXP_1oi0TPe6zNo,1672
|
|
5
5
|
backend/concurrency.py,sha256=2KY9I_8dDkyl_HTGx27ZxU4rFXx2vqbGOlo5RrRbPjA,3223
|
|
6
6
|
backend/database.py,sha256=1jLP1m9vNk5sEs4bh_xmX0T5ilZkUTX1c7nOVz5XnNc,30681
|
|
7
|
-
backend/main.py,sha256=
|
|
7
|
+
backend/main.py,sha256=Vh_hbLAGYZzitwXAZFRiU68S25ySGPmNGepQo2qwzfQ,56298
|
|
8
8
|
backend/models.py,sha256=El2F-RTHgxQ7-WODmiYCpjsTFDpjwF9PBt-JDa_XipE,2591
|
|
9
|
-
backend/refchecker_wrapper.py,sha256=
|
|
10
|
-
backend/thumbnail.py,sha256=
|
|
9
|
+
backend/refchecker_wrapper.py,sha256=p7xAR-mSaJ-Mj2RgelrecZ6PhpjajzlwOWRIf9PpQiw,55100
|
|
10
|
+
backend/thumbnail.py,sha256=zw6wLMyv9g4p83yqICh2ZHOAWK0WR6E8HMV6o-ocPmc,22251
|
|
11
11
|
backend/websocket_manager.py,sha256=l-Wou-rKV6n7t6Gcf5fR6s_4G-mssSrba0davNnYS70,4247
|
|
12
12
|
backend/static/favicon.svg,sha256=R0oQauh16Uy0D7JlT27k-zdjJtrvfPKOe9La5vKYwuM,395
|
|
13
13
|
backend/static/index.html,sha256=eJDL5t98ZJOl85d1_kJNNSUhmgGft_PCKcgbdG0UvCw,598
|
|
@@ -16,17 +16,17 @@ backend/static/assets/index-2P6L_39v.css,sha256=KC3Wa6jfD1qwmEoVpqTovlzf8fsn5oHY
|
|
|
16
16
|
backend/static/assets/index-hk21nqxR.js,sha256=z2agP8ZFYw4AfYi-GJ5E_8_k-lPF-frXOJtPk-I0hDs,369533
|
|
17
17
|
refchecker/__init__.py,sha256=Pg5MrtLxDBRcNYcI02N-bv3tzURVd1S3nQ8IyF7Zw7E,322
|
|
18
18
|
refchecker/__main__.py,sha256=agBbT9iKN0g2xXtRNCoh29Nr7z2n5vU-r0MCVJKi4tI,232
|
|
19
|
-
refchecker/__version__.py,sha256
|
|
19
|
+
refchecker/__version__.py,sha256=m3Fjueai8G45bivp4BWjHv85KJhSXCTkbngykxhZdiU,66
|
|
20
20
|
refchecker/checkers/__init__.py,sha256=-dR7HX0bfPq9YMXrnODoYbfNWFLqu706xoVsUdWHYRI,611
|
|
21
|
-
refchecker/checkers/arxiv_citation.py,sha256=
|
|
21
|
+
refchecker/checkers/arxiv_citation.py,sha256=7r75KZzDjrFtFDvjZM7ib9m-YDsTLLfquXkWvE32pf0,23097
|
|
22
22
|
refchecker/checkers/crossref.py,sha256=88moAyTudBqf9SKqTQkNAq1yyuRe95f8r4EpmJznupQ,20937
|
|
23
|
-
refchecker/checkers/enhanced_hybrid_checker.py,sha256=
|
|
23
|
+
refchecker/checkers/enhanced_hybrid_checker.py,sha256=bimUqYFXeEBABdVb4nOuEohHKGsu1zCWt0F-tlzlbDY,35674
|
|
24
24
|
refchecker/checkers/github_checker.py,sha256=YJ2sLj22qezw3uWjA0jhtDO0fOW4HUwcVbv2DQ4LjR0,14277
|
|
25
25
|
refchecker/checkers/local_semantic_scholar.py,sha256=c-KUTh99s-Di71h-pzdrwlPgoSTwB-tgVAZnCrMFXmw,21011
|
|
26
26
|
refchecker/checkers/openalex.py,sha256=WEjEppQMbutPs8kWOSorCIoXWqpJ9o1CXUicThHSWYU,20120
|
|
27
27
|
refchecker/checkers/openreview_checker.py,sha256=0IHZe4Nscy8fle28rmhy1hhsofR5g0FFSakk8FFH_0A,40540
|
|
28
28
|
refchecker/checkers/pdf_paper_checker.py,sha256=lrg09poNJBz9FNMrUoEjQ6CJbdYZAVANw0bCaTSb5oo,19904
|
|
29
|
-
refchecker/checkers/semantic_scholar.py,sha256=
|
|
29
|
+
refchecker/checkers/semantic_scholar.py,sha256=Cpi94DtJEBcMUsdOq5dbdgOX4lK4Eh4Dv-Skpw5-c1Q,37243
|
|
30
30
|
refchecker/checkers/webpage_checker.py,sha256=A_d5kg3OOsyliC00OVq_l0J-RJ4Ln7hUoURk21aO2fs,43653
|
|
31
31
|
refchecker/config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
|
|
32
32
|
refchecker/config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
|
|
@@ -38,8 +38,8 @@ refchecker/core/refchecker.py,sha256=nX8guDXFL1ZdT-K6KUJT_3iZjuoYsWj4e0rKrqd5VZA
|
|
|
38
38
|
refchecker/database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
|
|
39
39
|
refchecker/database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
|
|
40
40
|
refchecker/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
41
|
-
refchecker/llm/base.py,sha256=
|
|
42
|
-
refchecker/llm/providers.py,sha256=
|
|
41
|
+
refchecker/llm/base.py,sha256=BhpnUn7nrN8LzAnA8rQuG3zBvNovFYxShk1V9oAHlHU,16248
|
|
42
|
+
refchecker/llm/providers.py,sha256=2pOEre_OH_shgm0b9m3_nVIxyoY-MxhFM5KAP_qKo_Q,39131
|
|
43
43
|
refchecker/scripts/__init__.py,sha256=xJwo6afG8s7S888BK2Bxw2d7FX8aLkbl0l_ZoJOFibE,37
|
|
44
44
|
refchecker/scripts/start_vllm_server.py,sha256=ZepWp2y2cKFW0Kgsoima2RbmF02fTU29UFcLLpsBhFU,4213
|
|
45
45
|
refchecker/services/__init__.py,sha256=jGi9S74Msak3YR-C4Qb68VU7HB4oLaX9o1rlVAFpOFI,187
|
|
@@ -47,9 +47,9 @@ refchecker/services/pdf_processor.py,sha256=7i5x043qfnyzE5EQmytfy_uPjbeCJp4Ka5OP
|
|
|
47
47
|
refchecker/utils/__init__.py,sha256=SKTEQeKpLOFFMIzZiakzctsW9zGe_J7LDNJlygWV6RY,1221
|
|
48
48
|
refchecker/utils/arxiv_rate_limiter.py,sha256=axOv84Ge6q_mJ69lcyAFsCmHx9qXvV1aX71oSaxhnjE,4119
|
|
49
49
|
refchecker/utils/arxiv_utils.py,sha256=C7wqoCy9FZUQpoF92vLeJyrK1-6XoMmmL6u_hfDV3ro,18031
|
|
50
|
-
refchecker/utils/author_utils.py,sha256=
|
|
50
|
+
refchecker/utils/author_utils.py,sha256=aFO3nYQptWTpL6GzAxGGfi7rJ9wnhvxe_swrtueJhIQ,6095
|
|
51
51
|
refchecker/utils/biblatex_parser.py,sha256=IKRUMtRsjdXIktyk9XGArt_ms0asmqP549uhFvvumuE,25581
|
|
52
|
-
refchecker/utils/bibliography_utils.py,sha256
|
|
52
|
+
refchecker/utils/bibliography_utils.py,sha256=-sc9VP1DglahAawl_ySZrnxH0Z-CBz1QOeEEfv39EuI,11766
|
|
53
53
|
refchecker/utils/bibtex_parser.py,sha256=xY0dEqT8lBZF-W21YRpG28lp_F2ikLan7nK70WiCU2o,15286
|
|
54
54
|
refchecker/utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
|
|
55
55
|
refchecker/utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
|
|
@@ -59,8 +59,8 @@ refchecker/utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLH
|
|
|
59
59
|
refchecker/utils/text_utils.py,sha256=Tx1k0SqS1cmw4N9BDJY-Ipep2T-HMmKPqi4SMcq1ZJ8,235751
|
|
60
60
|
refchecker/utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
|
|
61
61
|
refchecker/utils/url_utils.py,sha256=7b0rWCQJSajzqOvD7ghsBZPejiq6mUIz6SGhvU_WGDs,9441
|
|
62
|
-
academic_refchecker-2.0.
|
|
63
|
-
academic_refchecker-2.0.
|
|
64
|
-
academic_refchecker-2.0.
|
|
65
|
-
academic_refchecker-2.0.
|
|
66
|
-
academic_refchecker-2.0.
|
|
62
|
+
academic_refchecker-2.0.14.dist-info/METADATA,sha256=6Pf7sAGjekcj-xUFp7oRAGd1u4AH8ttgmVfDaai7bX8,26611
|
|
63
|
+
academic_refchecker-2.0.14.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
64
|
+
academic_refchecker-2.0.14.dist-info/entry_points.txt,sha256=9cREsaKwlp05Ql0CBIjKrNHk5IG2cHY5LvJPsV2-SxA,108
|
|
65
|
+
academic_refchecker-2.0.14.dist-info/top_level.txt,sha256=FfNvrvpj25gfpUBjW0epvz7Qrdejhups5Za_DBiSRu4,19
|
|
66
|
+
academic_refchecker-2.0.14.dist-info/RECORD,,
|
backend/main.py
CHANGED
|
@@ -27,6 +27,7 @@ from .thumbnail import (
|
|
|
27
27
|
generate_pdf_thumbnail_async,
|
|
28
28
|
generate_pdf_preview_async,
|
|
29
29
|
get_text_thumbnail_async,
|
|
30
|
+
get_text_preview_async,
|
|
30
31
|
get_thumbnail_cache_path,
|
|
31
32
|
get_preview_cache_path
|
|
32
33
|
)
|
|
@@ -220,12 +221,15 @@ async def start_check(
|
|
|
220
221
|
elif source_type == "text":
|
|
221
222
|
if not source_text:
|
|
222
223
|
raise HTTPException(status_code=400, detail="No text provided")
|
|
224
|
+
# Normalize line endings - remove all \r to prevent double carriage returns
|
|
225
|
+
# Browser may send \r\n, and Windows file writing can add extra \r
|
|
226
|
+
normalized_text = source_text.replace('\r\n', '\n').replace('\r', '\n')
|
|
223
227
|
# Save pasted text to a file for later retrieval and thumbnail generation
|
|
224
228
|
text_dir = Path(tempfile.gettempdir()) / "refchecker_texts"
|
|
225
229
|
text_dir.mkdir(parents=True, exist_ok=True)
|
|
226
230
|
text_file_path = text_dir / f"pasted_{session_id}.txt"
|
|
227
|
-
with open(text_file_path, "w", encoding="utf-8") as f:
|
|
228
|
-
f.write(
|
|
231
|
+
with open(text_file_path, "w", encoding="utf-8", newline='\n') as f:
|
|
232
|
+
f.write(normalized_text)
|
|
229
233
|
paper_source = str(text_file_path)
|
|
230
234
|
paper_title = "Pasted Text"
|
|
231
235
|
elif source_type == "url":
|
|
@@ -646,9 +650,33 @@ async def get_preview(check_id: int):
|
|
|
646
650
|
media_type="image/png",
|
|
647
651
|
headers={"Cache-Control": "public, max-age=86400"} # Cache for 1 day
|
|
648
652
|
)
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
653
|
+
|
|
654
|
+
# For text sources, generate a high-resolution text preview for overlay display
|
|
655
|
+
if source_type == 'text':
|
|
656
|
+
logger.info(f"Generating text preview for check {check_id}")
|
|
657
|
+
preview_path = await get_text_preview_async(check_id, "", paper_source)
|
|
658
|
+
if preview_path and os.path.exists(preview_path):
|
|
659
|
+
return FileResponse(
|
|
660
|
+
preview_path,
|
|
661
|
+
media_type="image/png",
|
|
662
|
+
headers={"Cache-Control": "public, max-age=86400"}
|
|
663
|
+
)
|
|
664
|
+
|
|
665
|
+
# For non-PDF file uploads, also generate a text preview
|
|
666
|
+
if source_type == 'file' and not paper_source.lower().endswith('.pdf'):
|
|
667
|
+
logger.info(f"Generating text preview for uploaded file check {check_id}")
|
|
668
|
+
if os.path.exists(paper_source):
|
|
669
|
+
preview_path = await get_text_preview_async(check_id, "", paper_source)
|
|
670
|
+
else:
|
|
671
|
+
preview_path = await get_text_preview_async(check_id, "Uploaded file")
|
|
672
|
+
if preview_path and os.path.exists(preview_path):
|
|
673
|
+
return FileResponse(
|
|
674
|
+
preview_path,
|
|
675
|
+
media_type="image/png",
|
|
676
|
+
headers={"Cache-Control": "public, max-age=86400"}
|
|
677
|
+
)
|
|
678
|
+
|
|
679
|
+
raise HTTPException(status_code=404, detail="Could not generate preview")
|
|
652
680
|
|
|
653
681
|
except HTTPException:
|
|
654
682
|
raise
|
backend/refchecker_wrapper.py
CHANGED
|
@@ -3,6 +3,7 @@ Wrapper around refchecker library with progress callbacks for real-time updates
|
|
|
3
3
|
"""
|
|
4
4
|
import sys
|
|
5
5
|
import os
|
|
6
|
+
import re
|
|
6
7
|
import asyncio
|
|
7
8
|
import logging
|
|
8
9
|
import tempfile
|
|
@@ -238,6 +239,18 @@ class ProgressRefChecker:
|
|
|
238
239
|
if not any(u.get('url') == doi_url for u in authoritative_urls):
|
|
239
240
|
authoritative_urls.append({"type": "doi", "url": doi_url})
|
|
240
241
|
|
|
242
|
+
# Add Semantic Scholar URL if available
|
|
243
|
+
s2_paper_id = external_ids.get('S2PaperId')
|
|
244
|
+
if s2_paper_id:
|
|
245
|
+
s2_url = f"https://www.semanticscholar.org/paper/{s2_paper_id}"
|
|
246
|
+
if not any(u.get('url') == s2_url for u in authoritative_urls):
|
|
247
|
+
authoritative_urls.append({"type": "semantic_scholar", "url": s2_url})
|
|
248
|
+
|
|
249
|
+
# Also check for inline S2 URL (from merged data)
|
|
250
|
+
s2_inline_url = verified_data.get('_semantic_scholar_url')
|
|
251
|
+
if s2_inline_url and not any(u.get('url') == s2_inline_url for u in authoritative_urls):
|
|
252
|
+
authoritative_urls.append({"type": "semantic_scholar", "url": s2_inline_url})
|
|
253
|
+
|
|
241
254
|
# Format errors, warnings, and suggestions
|
|
242
255
|
formatted_errors = []
|
|
243
256
|
formatted_warnings = []
|
|
@@ -462,11 +475,20 @@ class ProgressRefChecker:
|
|
|
462
475
|
raise ValueError("PDF extraction requires an LLM to be configured. Please configure an LLM provider in settings.")
|
|
463
476
|
pdf_processor = PDFProcessor()
|
|
464
477
|
paper_text = await asyncio.to_thread(pdf_processor.extract_text_from_pdf, paper_source)
|
|
465
|
-
elif paper_source.lower().endswith(('.tex', '.txt')):
|
|
478
|
+
elif paper_source.lower().endswith(('.tex', '.txt', '.bib')):
|
|
466
479
|
def read_file():
|
|
467
480
|
with open(paper_source, 'r', encoding='utf-8') as f:
|
|
468
481
|
return f.read()
|
|
469
482
|
paper_text = await asyncio.to_thread(read_file)
|
|
483
|
+
|
|
484
|
+
# For .bib files, extract references directly using BibTeX parser
|
|
485
|
+
if paper_source.lower().endswith('.bib'):
|
|
486
|
+
logger.info("Processing uploaded .bib file as BibTeX")
|
|
487
|
+
refs_result = await self._extract_references_from_bibtex(paper_text)
|
|
488
|
+
if refs_result and refs_result[0]:
|
|
489
|
+
arxiv_source_references = refs_result[0]
|
|
490
|
+
extraction_method = 'bib'
|
|
491
|
+
logger.info(f"Extracted {len(arxiv_source_references)} references from .bib file")
|
|
470
492
|
else:
|
|
471
493
|
raise ValueError(f"Unsupported file type: {paper_source}")
|
|
472
494
|
elif source_type == "text":
|
|
@@ -494,6 +516,25 @@ class ProgressRefChecker:
|
|
|
494
516
|
arxiv_source_references = refs_result[0]
|
|
495
517
|
extraction_method = 'bbl' # Mark as bbl extraction
|
|
496
518
|
logger.info(f"Extracted {len(arxiv_source_references)} references from pasted .bbl content")
|
|
519
|
+
# Check if the pasted text is BibTeX format (@article, @misc, @inproceedings, etc.)
|
|
520
|
+
elif re.search(r'@\s*(article|book|inproceedings|incollection|misc|techreport|phdthesis|mastersthesis|conference|inbook|proceedings)\s*\{', paper_text, re.IGNORECASE):
|
|
521
|
+
logger.info("Detected BibTeX format in pasted text")
|
|
522
|
+
refs_result = await self._extract_references_from_bibtex(paper_text)
|
|
523
|
+
if refs_result and refs_result[0]:
|
|
524
|
+
arxiv_source_references = refs_result[0]
|
|
525
|
+
extraction_method = 'bib' # Mark as bib extraction
|
|
526
|
+
logger.info(f"Extracted {len(arxiv_source_references)} references from pasted BibTeX content")
|
|
527
|
+
# Fallback: Try BibTeX parsing anyway for partial/malformed content
|
|
528
|
+
# This handles cases like incomplete paste, or BibTeX-like content without standard entry types
|
|
529
|
+
elif any(marker in paper_text for marker in ['title={', 'author={', 'year={', 'eprint={', '@']):
|
|
530
|
+
logger.info("Detected possible BibTeX-like content, attempting parse")
|
|
531
|
+
refs_result = await self._extract_references_from_bibtex(paper_text)
|
|
532
|
+
if refs_result and refs_result[0]:
|
|
533
|
+
arxiv_source_references = refs_result[0]
|
|
534
|
+
extraction_method = 'bib'
|
|
535
|
+
logger.info(f"Extracted {len(arxiv_source_references)} references from partial BibTeX content")
|
|
536
|
+
else:
|
|
537
|
+
logger.warning("BibTeX-like content detected but parsing failed, will try LLM extraction")
|
|
497
538
|
# Don't update title for pasted text - keep the placeholder
|
|
498
539
|
else:
|
|
499
540
|
raise ValueError(f"Unsupported source type: {source_type}")
|
backend/thumbnail.py
CHANGED
|
@@ -416,6 +416,13 @@ def get_text_thumbnail(check_id: int, text_preview: str = "", text_file_path: st
|
|
|
416
416
|
except Exception as e:
|
|
417
417
|
logger.warning(f"Could not read text file: {e}")
|
|
418
418
|
|
|
419
|
+
# Clean up text content - remove excessive blank lines that cause rendering issues
|
|
420
|
+
if text_content:
|
|
421
|
+
# Normalize line endings and remove consecutive blank lines
|
|
422
|
+
lines = text_content.replace('\r\n', '\n').replace('\r', '\n').split('\n')
|
|
423
|
+
# Keep only non-empty lines
|
|
424
|
+
text_content = '\n'.join(line for line in lines if line.strip())
|
|
425
|
+
|
|
419
426
|
# Create a document-like image with actual text content
|
|
420
427
|
doc = fitz.open()
|
|
421
428
|
page = doc.new_page(width=THUMBNAIL_WIDTH, height=int(THUMBNAIL_WIDTH * 1.4))
|
|
@@ -483,6 +490,116 @@ def get_text_thumbnail(check_id: int, text_preview: str = "", text_file_path: st
|
|
|
483
490
|
return None
|
|
484
491
|
|
|
485
492
|
|
|
493
|
+
def get_text_preview(check_id: int, text_preview: str = "", text_file_path: str = "") -> Optional[str]:
|
|
494
|
+
"""
|
|
495
|
+
Generate a high-resolution preview for pasted text showing actual content.
|
|
496
|
+
|
|
497
|
+
Creates a larger image (similar to PDF previews) with the text content.
|
|
498
|
+
|
|
499
|
+
Args:
|
|
500
|
+
check_id: Check ID for naming
|
|
501
|
+
text_preview: Optional first few lines of text to display
|
|
502
|
+
text_file_path: Optional path to the text file to read content from
|
|
503
|
+
|
|
504
|
+
Returns:
|
|
505
|
+
Path to the generated preview, or None if generation failed
|
|
506
|
+
"""
|
|
507
|
+
try:
|
|
508
|
+
import fitz
|
|
509
|
+
|
|
510
|
+
output_path = get_preview_cache_path(f"text_{check_id}", check_id)
|
|
511
|
+
|
|
512
|
+
if output_path.exists():
|
|
513
|
+
return str(output_path)
|
|
514
|
+
|
|
515
|
+
# Try to read text content from file
|
|
516
|
+
text_content = text_preview
|
|
517
|
+
if text_file_path and os.path.exists(text_file_path):
|
|
518
|
+
try:
|
|
519
|
+
with open(text_file_path, 'r', encoding='utf-8') as f:
|
|
520
|
+
text_content = f.read()
|
|
521
|
+
except Exception as e:
|
|
522
|
+
logger.warning(f"Could not read text file: {e}")
|
|
523
|
+
|
|
524
|
+
# Clean up text content - remove excessive blank lines that cause rendering issues
|
|
525
|
+
if text_content:
|
|
526
|
+
# Normalize line endings and remove consecutive blank lines
|
|
527
|
+
lines = text_content.replace('\r\n', '\n').replace('\r', '\n').split('\n')
|
|
528
|
+
# Keep only non-empty lines
|
|
529
|
+
text_content = '\n'.join(line for line in lines if line.strip())
|
|
530
|
+
|
|
531
|
+
# Create a document-like image with actual text content at high resolution
|
|
532
|
+
doc = fitz.open()
|
|
533
|
+
page = doc.new_page(width=PREVIEW_WIDTH, height=int(PREVIEW_WIDTH * 1.4))
|
|
534
|
+
|
|
535
|
+
# Fill with white/off-white background
|
|
536
|
+
page.draw_rect(page.rect, color=(0.9, 0.9, 0.9), fill=(0.98, 0.98, 0.98))
|
|
537
|
+
|
|
538
|
+
# Draw border
|
|
539
|
+
page.draw_rect(page.rect, color=(0.7, 0.7, 0.7), width=2)
|
|
540
|
+
|
|
541
|
+
# Draw actual text content if available
|
|
542
|
+
margin = 40
|
|
543
|
+
if text_content:
|
|
544
|
+
# Create a text box for the content
|
|
545
|
+
text_rect = fitz.Rect(margin, margin, PREVIEW_WIDTH - margin, int(PREVIEW_WIDTH * 1.4) - margin)
|
|
546
|
+
|
|
547
|
+
# Truncate to first ~4000 chars for preview
|
|
548
|
+
display_text = text_content[:4000]
|
|
549
|
+
if len(text_content) > 4000:
|
|
550
|
+
display_text += "\n\n..."
|
|
551
|
+
|
|
552
|
+
# Insert text with readable font size
|
|
553
|
+
page.insert_textbox(
|
|
554
|
+
text_rect,
|
|
555
|
+
display_text,
|
|
556
|
+
fontsize=14,
|
|
557
|
+
color=(0.15, 0.15, 0.15),
|
|
558
|
+
fontname="helv"
|
|
559
|
+
)
|
|
560
|
+
else:
|
|
561
|
+
# Fallback: Draw placeholder
|
|
562
|
+
header_rect = fitz.Rect(margin, margin, PREVIEW_WIDTH - margin, margin + 60)
|
|
563
|
+
page.insert_textbox(header_rect, "Pasted Text", fontsize=36, color=(0.3, 0.3, 0.5))
|
|
564
|
+
|
|
565
|
+
# Draw placeholder lines
|
|
566
|
+
line_height = 24
|
|
567
|
+
y = margin + 100
|
|
568
|
+
|
|
569
|
+
for i in range(20):
|
|
570
|
+
line_width = PREVIEW_WIDTH - 2 * margin
|
|
571
|
+
if i % 3 == 2:
|
|
572
|
+
line_width = line_width * 0.7
|
|
573
|
+
|
|
574
|
+
page.draw_line(
|
|
575
|
+
fitz.Point(margin, y),
|
|
576
|
+
fitz.Point(margin + line_width, y),
|
|
577
|
+
color=(0.7, 0.7, 0.7),
|
|
578
|
+
width=3
|
|
579
|
+
)
|
|
580
|
+
y += line_height
|
|
581
|
+
|
|
582
|
+
# Render to pixmap and save
|
|
583
|
+
pix = page.get_pixmap(alpha=False)
|
|
584
|
+
pix.save(str(output_path))
|
|
585
|
+
doc.close()
|
|
586
|
+
|
|
587
|
+
logger.info(f"Generated text preview: {output_path}")
|
|
588
|
+
return str(output_path)
|
|
589
|
+
|
|
590
|
+
except ImportError:
|
|
591
|
+
logger.error("PyMuPDF (fitz) is not installed")
|
|
592
|
+
return None
|
|
593
|
+
except Exception as e:
|
|
594
|
+
logger.error(f"Error generating text preview: {e}")
|
|
595
|
+
return None
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
async def get_text_preview_async(check_id: int, text_preview: str = "", text_file_path: str = "") -> Optional[str]:
|
|
599
|
+
"""Async wrapper for text preview generation."""
|
|
600
|
+
return await asyncio.to_thread(get_text_preview, check_id, text_preview, text_file_path)
|
|
601
|
+
|
|
602
|
+
|
|
486
603
|
async def get_text_thumbnail_async(check_id: int, text_preview: str = "", text_file_path: str = "") -> Optional[str]:
|
|
487
604
|
"""Async wrapper for text thumbnail generation."""
|
|
488
605
|
return await asyncio.to_thread(get_text_thumbnail, check_id, text_preview, text_file_path)
|
refchecker/__version__.py
CHANGED