pdf-file-renamer 0.6.2__tar.gz → 0.6.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/CHANGELOG.md +19 -0
  2. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/PKG-INFO +3 -2
  3. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/README.md +2 -1
  4. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/coverage.xml +75 -34
  5. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/pyproject.toml +1 -1
  6. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/infrastructure/doi/pdf2doi_extractor.py +139 -3
  7. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/.env.example +0 -0
  8. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/.github/workflows/ci.yml +0 -0
  9. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/.github/workflows/release.yml +0 -0
  10. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/.gitignore +0 -0
  11. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/.python-version +0 -0
  12. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/LICENSE +0 -0
  13. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/REFACTORING_SUMMARY.md +0 -0
  14. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/demo.gif +0 -0
  15. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/demo.tape +0 -0
  16. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/scripts/create_demo_gif.py +0 -0
  17. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/scripts/record_demo.sh +0 -0
  18. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/__init__.py +0 -0
  19. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/application/__init__.py +0 -0
  20. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/application/filename_service.py +0 -0
  21. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/application/pdf_rename_workflow.py +0 -0
  22. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/application/rename_service.py +0 -0
  23. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/domain/__init__.py +0 -0
  24. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/domain/models.py +0 -0
  25. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/domain/ports.py +0 -0
  26. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/infrastructure/__init__.py +0 -0
  27. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/infrastructure/config.py +0 -0
  28. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/infrastructure/doi/__init__.py +0 -0
  29. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/infrastructure/llm/__init__.py +0 -0
  30. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/infrastructure/llm/pydantic_ai_provider.py +0 -0
  31. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/infrastructure/pdf/__init__.py +0 -0
  32. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/infrastructure/pdf/composite.py +0 -0
  33. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/infrastructure/pdf/docling_extractor.py +0 -0
  34. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/infrastructure/pdf/pymupdf_extractor.py +0 -0
  35. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/main.py +0 -0
  36. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/presentation/__init__.py +0 -0
  37. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/presentation/cli.py +0 -0
  38. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/src/pdf_file_renamer/presentation/formatters.py +0 -0
  39. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/tests/__init__.py +0 -0
  40. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/tests/data/2025-dennis-managing-complexity.pdf +0 -0
  41. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/tests/data/Camp_of_the_Saints.pdf +0 -0
  42. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/tests/data/s43588-025-00854-1.pdf +0 -0
  43. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/tests/test_domain_models.py +0 -0
  44. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/tests/test_filename_service.py +0 -0
  45. {pdf_file_renamer-0.6.2 → pdf_file_renamer-0.6.3}/tests/test_rename_service.py +0 -0
@@ -5,6 +5,24 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.6.3] - 2025-10-14
9
+
10
+ ### Fixed
11
+ - Fixed critical bug where pdf2doi extracted DOIs from citations instead of the paper's own DOI
12
+ - Added DOI validation to verify metadata matches PDF content before accepting DOI
13
+ - Prevents incorrect naming when papers don't have their own DOI but cite other papers
14
+
15
+ ### Added
16
+ - DOI metadata validation against PDF first page content
17
+ - Title similarity checking using SequenceMatcher
18
+ - Configurable validation thresholds for DOI matching
19
+ - Fallback to LLM-based naming when DOI validation fails
20
+
21
+ ### Changed
22
+ - DOI extraction now validates that extracted metadata matches the actual PDF content
23
+ - Improved accuracy by rejecting citation DOIs that don't match the paper's title
24
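+ - DOI validation reads only the first page instead of the full document: an exact title match may occur anywhere on that page, while similarity and keyword matching are limited to the title area (first ~300 characters)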
25
+
8
26
  ## [0.6.2] - 2025-10-14
9
27
 
10
28
  ### Added
@@ -125,6 +143,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
125
143
  - Confidence scoring for suggestions
126
144
  - Support for custom output directories
127
145
 
146
+ [0.6.3]: https://github.com/nostoslabs/pdf-renamer/compare/v0.6.2...v0.6.3
128
147
  [0.6.2]: https://github.com/nostoslabs/pdf-renamer/compare/v0.6.1...v0.6.2
129
148
  [0.6.1]: https://github.com/nostoslabs/pdf-renamer/compare/v0.6.0...v0.6.1
130
149
  [0.6.0]: https://github.com/nostoslabs/pdf-renamer/compare/v0.5.0...v0.6.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pdf-file-renamer
3
- Version: 0.6.2
3
+ Version: 0.6.3
4
4
  Summary: Intelligent PDF renaming using LLMs with DOI-based naming and interactive workflow
5
5
  Project-URL: Homepage, https://github.com/nostoslabs/pdf-renamer
6
6
  Project-URL: Repository, https://github.com/nostoslabs/pdf-renamer
@@ -285,7 +285,8 @@ The tool uses a multi-strategy approach to generate accurate filenames:
285
285
 
286
286
  1. **DOI Detection** (for academic papers)
287
287
  - Searches PDF for DOI identifiers using [pdf2doi](https://github.com/MicheleCotrufo/pdf2doi)
288
- - If found, queries authoritative metadata (title, authors, year, journal)
288
+ - **Validates DOI metadata** against PDF content to prevent citation DOI mismatches
289
+ - If found and validated, queries authoritative metadata (title, authors, year, journal)
289
290
  - Generates filename with **very high confidence** from validated metadata
290
291
  - **Saves API costs** - no LLM call needed for papers with DOIs
291
292
 
@@ -237,7 +237,8 @@ The tool uses a multi-strategy approach to generate accurate filenames:
237
237
 
238
238
  1. **DOI Detection** (for academic papers)
239
239
  - Searches PDF for DOI identifiers using [pdf2doi](https://github.com/MicheleCotrufo/pdf2doi)
240
- - If found, queries authoritative metadata (title, authors, year, journal)
240
+ - **Validates DOI metadata** against PDF content to prevent citation DOI mismatches
241
+ - If found and validated, queries authoritative metadata (title, authors, year, journal)
241
242
  - Generates filename with **very high confidence** from validated metadata
242
243
  - **Saves API costs** - no LLM call needed for papers with DOIs
243
244
 
@@ -1,5 +1,5 @@
1
1
  <?xml version="1.0" ?>
2
- <coverage version="7.10.7" timestamp="1760490556851" lines-valid="728" lines-covered="142" line-rate="0.1951" branches-covered="0" branches-valid="0" branch-rate="0" complexity="0">
2
+ <coverage version="7.10.7" timestamp="1760499723997" lines-valid="769" lines-covered="142" line-rate="0.1847" branches-covered="0" branches-valid="0" branch-rate="0" complexity="0">
3
3
  <!-- Generated by coverage.py: https://coverage.readthedocs.io/en/7.10.7 -->
4
4
  <!-- Based on https://raw.githubusercontent.com/cobertura/web/master/htdocs/xml/coverage-04.dtd -->
5
5
  <sources>
@@ -329,44 +329,40 @@
329
329
  <line number="4" hits="0"/>
330
330
  <line number="5" hits="0"/>
331
331
  <line number="6" hits="0"/>
332
- <line number="8" hits="0"/>
333
- <line number="10" hits="0"/>
332
+ <line number="7" hits="0"/>
333
+ <line number="9" hits="0"/>
334
334
  <line number="11" hits="0"/>
335
- <line number="14" hits="0"/>
336
- <line number="17" hits="0"/>
337
- <line number="20" hits="0"/>
338
- <line number="22" hits="0"/>
339
- <line number="32" hits="0"/>
340
- <line number="34" hits="0"/>
341
- <line number="35" hits="0"/>
342
- <line number="38" hits="0"/>
343
- <line number="39" hits="0"/>
344
- <line number="42" hits="0"/>
335
+ <line number="12" hits="0"/>
336
+ <line number="15" hits="0"/>
337
+ <line number="18" hits="0"/>
338
+ <line number="27" hits="0"/>
339
+ <line number="28" hits="0"/>
340
+ <line number="29" hits="0"/>
341
+ <line number="31" hits="0"/>
342
+ <line number="41" hits="0"/>
345
343
  <line number="43" hits="0"/>
346
344
  <line number="44" hits="0"/>
347
- <line number="46" hits="0"/>
348
345
  <line number="47" hits="0"/>
349
346
  <line number="48" hits="0"/>
350
347
  <line number="51" hits="0"/>
351
- <line number="54" hits="0"/>
348
+ <line number="52" hits="0"/>
349
+ <line number="53" hits="0"/>
350
+ <line number="55" hits="0"/>
352
351
  <line number="56" hits="0"/>
353
352
  <line number="57" hits="0"/>
354
- <line number="58" hits="0"/>
355
- <line number="59" hits="0"/>
356
- <line number="62" hits="0"/>
353
+ <line number="60" hits="0"/>
354
+ <line number="63" hits="0"/>
357
355
  <line number="65" hits="0"/>
358
356
  <line number="66" hits="0"/>
359
357
  <line number="67" hits="0"/>
360
358
  <line number="68" hits="0"/>
361
- <line number="69" hits="0"/>
362
- <line number="70" hits="0"/>
363
359
  <line number="71" hits="0"/>
364
- <line number="72" hits="0"/>
365
- <line number="73" hits="0"/>
366
360
  <line number="74" hits="0"/>
367
361
  <line number="75" hits="0"/>
368
362
  <line number="76" hits="0"/>
369
363
  <line number="77" hits="0"/>
364
+ <line number="78" hits="0"/>
365
+ <line number="79" hits="0"/>
370
366
  <line number="80" hits="0"/>
371
367
  <line number="81" hits="0"/>
372
368
  <line number="82" hits="0"/>
@@ -375,28 +371,73 @@
375
371
  <line number="85" hits="0"/>
376
372
  <line number="86" hits="0"/>
377
373
  <line number="89" hits="0"/>
374
+ <line number="90" hits="0"/>
375
+ <line number="91" hits="0"/>
378
376
  <line number="92" hits="0"/>
377
+ <line number="93" hits="0"/>
379
378
  <line number="94" hits="0"/>
380
- <line number="104" hits="0"/>
381
- <line number="106" hits="0"/>
382
- <line number="108" hits="0"/>
379
+ <line number="95" hits="0"/>
380
+ <line number="98" hits="0"/>
381
+ <line number="101" hits="0"/>
382
+ <line number="103" hits="0"/>
383
+ <line number="114" hits="0"/>
384
+ <line number="116" hits="0"/>
385
+ <line number="117" hits="0"/>
383
386
  <line number="119" hits="0"/>
384
- <line number="120" hits="0"/>
387
+ <line number="121" hits="0"/>
385
388
  <line number="123" hits="0"/>
386
- <line number="124" hits="0"/>
387
- <line number="126" hits="0"/>
389
+ <line number="125" hits="0"/>
388
390
  <line number="127" hits="0"/>
389
- <line number="129" hits="0"/>
390
- <line number="131" hits="0"/>
391
- <line number="141" hits="0"/>
391
+ <line number="138" hits="0"/>
392
+ <line number="139" hits="0"/>
392
393
  <line number="142" hits="0"/>
394
+ <line number="143" hits="0"/>
393
395
  <line number="145" hits="0"/>
394
396
  <line number="146" hits="0"/>
395
397
  <line number="148" hits="0"/>
396
- <line number="149" hits="0"/>
397
- <line number="151" hits="0"/>
398
- <line number="154" hits="0"/>
398
+ <line number="150" hits="0"/>
399
399
  <line number="160" hits="0"/>
400
+ <line number="161" hits="0"/>
401
+ <line number="164" hits="0"/>
402
+ <line number="165" hits="0"/>
403
+ <line number="167" hits="0"/>
404
+ <line number="168" hits="0"/>
405
+ <line number="170" hits="0"/>
406
+ <line number="173" hits="0"/>
407
+ <line number="179" hits="0"/>
408
+ <line number="181" hits="0"/>
409
+ <line number="191" hits="0"/>
410
+ <line number="192" hits="0"/>
411
+ <line number="194" hits="0"/>
412
+ <line number="196" hits="0"/>
413
+ <line number="197" hits="0"/>
414
+ <line number="198" hits="0"/>
415
+ <line number="199" hits="0"/>
416
+ <line number="200" hits="0"/>
417
+ <line number="202" hits="0"/>
418
+ <line number="203" hits="0"/>
419
+ <line number="204" hits="0"/>
420
+ <line number="206" hits="0"/>
421
+ <line number="220" hits="0"/>
422
+ <line number="222" hits="0"/>
423
+ <line number="225" hits="0"/>
424
+ <line number="226" hits="0"/>
425
+ <line number="229" hits="0"/>
426
+ <line number="230" hits="0"/>
427
+ <line number="234" hits="0"/>
428
+ <line number="235" hits="0"/>
429
+ <line number="237" hits="0"/>
430
+ <line number="238" hits="0"/>
431
+ <line number="242" hits="0"/>
432
+ <line number="243" hits="0"/>
433
+ <line number="244" hits="0"/>
434
+ <line number="247" hits="0"/>
435
+ <line number="248" hits="0"/>
436
+ <line number="250" hits="0"/>
437
+ <line number="252" hits="0"/>
438
+ <line number="263" hits="0"/>
439
+ <line number="293" hits="0"/>
440
+ <line number="296" hits="0"/>
400
441
  </lines>
401
442
  </class>
402
443
  </classes>
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "pdf-file-renamer"
3
- version = "0.6.2"
3
+ version = "0.6.3"
4
4
  description = "Intelligent PDF renaming using LLMs with DOI-based naming and interactive workflow"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.11"
@@ -3,6 +3,7 @@
3
3
  import asyncio
4
4
  import contextlib
5
5
  import re
6
+ from difflib import SequenceMatcher
6
7
  from pathlib import Path
7
8
 
8
9
  import pdf2doi
@@ -14,10 +15,18 @@ from pdf_file_renamer.domain.ports import DOIExtractor
14
15
  class PDF2DOIExtractor(DOIExtractor):
15
16
  """Extract DOI from PDF files using pdf2doi library."""
16
17
 
17
- def __init__(self) -> None:
18
- """Initialize the PDF2DOI extractor."""
18
def __init__(self, validate_match: bool = True, similarity_threshold: float = 0.3) -> None:
    """
    Configure the extractor and turn off pdf2doi's verbose output.

    Args:
        validate_match: When True, extracted DOI metadata is compared with
            the PDF's first-page text before it is returned
        similarity_threshold: Minimum similarity score (0-1) a title must
            reach during validation
    """
    # Store the validation settings read later during DOI extraction.
    self.validate_match, self.similarity_threshold = (
        validate_match,
        similarity_threshold,
    )

    # Suppress pdf2doi verbose output
    pdf2doi.config.set("verbose", False)
21
30
 
22
31
  async def extract_doi(self, pdf_path: Path) -> DOIMetadata | None:
23
32
  """
@@ -91,7 +100,7 @@ class PDF2DOIExtractor(DOIExtractor):
91
100
  # Extract publisher
92
101
  publisher = metadata.get("publisher")
93
102
 
94
- return DOIMetadata(
103
+ doi_metadata = DOIMetadata(
95
104
  doi=identifier,
96
105
  title=title,
97
106
  authors=authors,
@@ -101,6 +110,16 @@ class PDF2DOIExtractor(DOIExtractor):
101
110
  raw_bibtex=validation_info if validation_info else None,
102
111
  )
103
112
 
113
+ # Validate that the DOI metadata matches the PDF content
114
+ if self.validate_match:
115
+ # Extract first page text from PDF to check for title match
116
+ pdf_text = await self._extract_pdf_first_page(pdf_path)
117
+ if not self._validate_doi_matches_pdf(doi_metadata, pdf_text):
118
+ # DOI doesn't match - likely a citation DOI, not the paper's DOI
119
+ return None
120
+
121
+ return doi_metadata
122
+
104
123
  except Exception:
105
124
  # Silently fail - DOI extraction is opportunistic
106
125
  return None
@@ -158,3 +177,120 @@ class PDF2DOIExtractor(DOIExtractor):
158
177
  ]
159
178
 
160
179
  return authors if authors else None
180
+
181
+ async def _extract_pdf_first_page(self, pdf_path: Path) -> str:
182
+ """
183
+ Extract text from the first page of a PDF.
184
+
185
+ Args:
186
+ pdf_path: Path to PDF file
187
+
188
+ Returns:
189
+ Text from first page (empty string if extraction fails)
190
+ """
191
+ try:
192
+ import fitz # PyMuPDF
193
+
194
+ loop = asyncio.get_event_loop()
195
+
196
+ def extract() -> str:
197
+ with fitz.open(pdf_path) as doc:
198
+ if len(doc) > 0:
199
+ return doc[0].get_text()
200
+ return ""
201
+
202
+ return await loop.run_in_executor(None, extract)
203
+ except Exception:
204
+ return ""
205
+
206
+ def _validate_doi_matches_pdf(self, doi_metadata: DOIMetadata, pdf_text: str) -> bool:
207
+ """
208
+ Validate that DOI metadata matches the PDF content.
209
+
210
+ This checks if the title from the DOI metadata appears in the PDF text
211
+ (particularly the first page, where the title should be).
212
+
213
+ Args:
214
+ doi_metadata: DOI metadata to validate
215
+ pdf_text: Text from PDF first page (not full document!)
216
+
217
+ Returns:
218
+ True if metadata appears to match PDF, False otherwise
219
+ """
220
+ if not doi_metadata.title or not pdf_text:
221
+ # If we can't validate, assume it's valid (fail open)
222
+ return True
223
+
224
+ # Normalize text for comparison
225
+ pdf_text_lower = pdf_text.lower()
226
+ title_lower = doi_metadata.title.lower()
227
+
228
+ # Check if the full title appears in the PDF text
229
+ if title_lower in pdf_text_lower:
230
+ return True
231
+
232
+ # Check similarity using SequenceMatcher on first ~300 chars (title area)
233
+ # Most paper titles appear in the first few hundred characters
234
+ title_area = pdf_text_lower[:300]
235
+ similarity = SequenceMatcher(None, title_lower, title_area).ratio()
236
+
237
+ if similarity >= self.similarity_threshold:
238
+ return True
239
+
240
+ # Check if significant words from title appear in the title area ONLY
241
+ # This prevents matching citation DOIs from the references section
242
+ title_words = self._extract_significant_words(title_lower)
243
+ if not title_words:
244
+ return True # Can't validate, fail open
245
+
246
+ # Require at least 70% of significant words to appear in the title area
247
+ matches = sum(1 for word in title_words if word in title_area)
248
+ match_ratio = matches / len(title_words)
249
+
250
+ return match_ratio >= 0.7
251
+
252
+ def _extract_significant_words(self, text: str) -> list[str]:
253
+ """
254
+ Extract significant words from text (removing common words).
255
+
256
+ Args:
257
+ text: Input text
258
+
259
+ Returns:
260
+ List of significant words
261
+ """
262
+ # Common words to skip
263
+ stop_words = {
264
+ "a",
265
+ "an",
266
+ "the",
267
+ "and",
268
+ "or",
269
+ "but",
270
+ "in",
271
+ "on",
272
+ "at",
273
+ "to",
274
+ "for",
275
+ "of",
276
+ "with",
277
+ "by",
278
+ "from",
279
+ "as",
280
+ "is",
281
+ "was",
282
+ "are",
283
+ "were",
284
+ "been",
285
+ "be",
286
+ "this",
287
+ "that",
288
+ "these",
289
+ "those",
290
+ }
291
+
292
+ # Extract words (alphanumeric only)
293
+ words = re.findall(r"\b\w+\b", text.lower())
294
+
295
+ # Filter stop words and short words
296
+ return [w for w in words if w not in stop_words and len(w) > 3]