academic-refchecker 1.2.35__tar.gz → 1.2.37__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-1.2.35/src/academic_refchecker.egg-info → academic_refchecker-1.2.37}/PKG-INFO +1 -1
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/__version__.py +1 -1
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/academic_refchecker.egg-info/SOURCES.txt +2 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/core/refchecker.py +87 -264
- academic_refchecker-1.2.37/src/utils/arxiv_utils.py +376 -0
- academic_refchecker-1.2.37/src/utils/biblatex_parser.py +474 -0
- academic_refchecker-1.2.37/src/utils/bibtex_parser.py +334 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/text_utils.py +288 -83
- academic_refchecker-1.2.35/src/utils/arxiv_utils.py +0 -176
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/LICENSE +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/MANIFEST.in +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/README.md +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/pyproject.toml +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/requirements.txt +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/scripts/download_db.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/scripts/run_tests.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/setup.cfg +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/__init__.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/academic_refchecker.egg-info/requires.txt +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/academic_refchecker.egg-info/top_level.txt +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/__init__.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/crossref.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/enhanced_hybrid_checker.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/github_checker.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/local_semantic_scholar.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/openalex.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/openreview_checker.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/semantic_scholar.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/webpage_checker.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/config/__init__.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/config/logging.conf +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/config/settings.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/core/__init__.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/core/db_connection_pool.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/core/parallel_processor.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/database/__init__.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/database/download_semantic_scholar_db.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/llm/__init__.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/llm/base.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/llm/providers.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/scripts/__init__.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/services/__init__.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/services/pdf_processor.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/__init__.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/author_utils.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/config_validator.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/db_utils.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/doi_utils.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/error_utils.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/mock_objects.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/unicode_utils.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/url_utils.py +0 -0
@@ -202,6 +202,10 @@ class ArxivReferenceChecker:
         # debug mode
         self.debug_mode = debug_mode
 
+        # Initialize extraction flags
+        self.used_regex_extraction = False
+        self.used_unreliable_extraction = False
+
         # Parallel processing configuration
         self.enable_parallel = enable_parallel
         self.max_workers = max_workers
@@ -2887,6 +2891,7 @@ class ArxivReferenceChecker:
         self.total_other_refs = 0
         self.total_unverified_refs = 0
         self.used_regex_extraction = False
+        self.used_unreliable_extraction = False  # Only set for fallback regex parsing, not BibTeX
 
         try:
             # Get papers to process
@@ -3105,8 +3110,8 @@ class ArxivReferenceChecker:
         if self.total_errors_found == 0 and self.total_warnings_found == 0 and self.total_unverified_refs == 0:
             print(f"✅ All references verified successfully!")
 
-        # Show warning if
-        if self.
+        # Show warning if unreliable extraction was used and there are many errors
+        if self.used_unreliable_extraction and self.total_errors_found > 5:
             print(f"\n⚠️ Results might be affected by incorrect reference extraction. Consider using LLM extraction, which is more robust.")
 
         if self.verification_output_file:
@@ -3124,8 +3129,8 @@ class ArxivReferenceChecker:
         print(f" Total warnings: {self.total_warnings_found}")
         print(f"❓ References that couldn't be verified: {self.total_unverified_refs}")
 
-        # Show warning if
-        if self.
+        # Show warning if unreliable extraction was used and there are many errors
+        if self.used_unreliable_extraction and self.total_errors_found > 5:
             print(f"\n⚠️ Results might be affected by incorrect reference extraction. Consider using LLM extraction, which is more robust.")
 
         if self.verification_output_file:
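Taken together, the two summary hunks above implement one gate: the extraction-quality warning only fires when the unreliable fallback parser was actually used and more than five errors were found. A minimal standalone sketch of that gate (the flag name and threshold come from the diff; the class itself is hypothetical, not the package's ArxivReferenceChecker):

```python
# Minimal sketch of the warning gate; flag name and threshold are taken
# from the diff, the surrounding class is hypothetical.
class ExtractionSummary:
    def __init__(self):
        self.used_unreliable_extraction = False  # set only by fallback regex parsing
        self.total_errors_found = 0

    def maybe_warn(self):
        # Robust parsers (BibTeX, biblatex, ACM/natbib) leave the flag False,
        # so their errors are reported without this caveat.
        if self.used_unreliable_extraction and self.total_errors_found > 5:
            print("\n⚠️ Results might be affected by incorrect reference extraction. "
                  "Consider using LLM extraction, which is more robust.")
```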
@@ -3401,15 +3406,25 @@ class ArxivReferenceChecker:
         if detect_standard_acm_natbib_format(bibliography_text):
             logger.info("Detected standard ACM/natbib format, using regex-based parsing")
             self.used_regex_extraction = True
+            # Note: ACM/natbib parsing is also quite robust for standard formats
             return self._parse_standard_acm_natbib_references(bibliography_text)
 
         # Check if this is BibTeX format
-        from utils.
+        from utils.bibtex_parser import detect_bibtex_format
         if detect_bibtex_format(bibliography_text):
             logger.info("Detected BibTeX format, using BibTeX parser")
             self.used_regex_extraction = True
+            # Note: BibTeX parsing is robust, so we don't set used_unreliable_extraction
             return self._parse_bibtex_references(bibliography_text)
 
+        # Check if this is biblatex format
+        from utils.biblatex_parser import detect_biblatex_format
+        if detect_biblatex_format(bibliography_text):
+            logger.info("Detected biblatex format, using biblatex parser")
+            self.used_regex_extraction = True
+            # Note: biblatex parsing is also robust, so we don't set used_unreliable_extraction
+            return self._parse_biblatex_references(bibliography_text)
+
         # For non-standard formats, try LLM-based extraction if available
         if self.llm_extractor:
             try:
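The hunk above turns parser selection into an ordered cascade: ACM/natbib, then BibTeX, then the new biblatex branch, then LLM extraction, and only then the regex fallback. A condensed sketch of that ordering follows; the detector imports are the ones shown in the diff, while the wrapper function and its return values are illustrative only:

```python
# Condensed sketch of the new detection cascade; detector imports are the
# ones shown in the diff, the wrapper function itself is hypothetical.
from utils.bibtex_parser import detect_bibtex_format
from utils.biblatex_parser import detect_biblatex_format

def choose_parser(bibliography_text, have_llm=False):
    """Return which parser the cascade would pick for this text."""
    # detect_standard_acm_natbib_format(...) runs before all of these
    # in the real method; its import location isn't visible in this diff.
    if detect_bibtex_format(bibliography_text):
        return "bibtex"         # robust; used_unreliable_extraction stays False
    if detect_biblatex_format(bibliography_text):
        return "biblatex"       # new branch in 1.2.37, also treated as robust
    if have_llm:
        return "llm"            # preferred for non-standard formats
    return "regex-fallback"     # the only path that sets used_unreliable_extraction
```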
@@ -3431,6 +3446,7 @@ class ArxivReferenceChecker:
         # Fallback to regex-based parsing only if LLM was not specified
         logger.info("No LLM available, falling back to regex-based parsing")
         self.used_regex_extraction = True
+        self.used_unreliable_extraction = True  # This is the unreliable fallback parsing
         return self._parse_references_regex(bibliography_text)
 
     def _parse_standard_acm_natbib_references(self, bibliography_text):
@@ -3622,10 +3638,22 @@ class ArxivReferenceChecker:
         self.used_regex_extraction = True
 
         # Check if this is BibTeX format first
-
+        from utils.bibtex_parser import detect_bibtex_format
+        if detect_bibtex_format(bibliography_text):
             logger.debug("Detected BibTeX format, using BibTeX-specific parsing")
+            # BibTeX parsing is robust, so we don't set used_unreliable_extraction
             return self._parse_bibtex_references(bibliography_text)
 
+        # Check if this is biblatex format
+        from utils.biblatex_parser import detect_biblatex_format
+        if detect_biblatex_format(bibliography_text):
+            logger.debug("Detected biblatex format, using biblatex-specific parsing")
+            # biblatex parsing is also robust, so we don't set used_unreliable_extraction
+            return self._parse_biblatex_references(bibliography_text)
+
+        # If we reach here, we're using the unreliable fallback regex parsing
+        self.used_unreliable_extraction = True
+
         # --- IMPROVED SPLITTING: handle concatenated references like [3]... [4]... ---
         # First, normalize the bibliography text to handle multi-line references
         # This fixes the issue where years appear as separate lines
@@ -4054,270 +4082,33 @@ class ArxivReferenceChecker:
         Returns:
             List of structured reference dictionaries
         """
-
+        # Use the dedicated BibTeX parser
+        from utils.bibtex_parser import parse_bibtex_references
 
-        #
-
-        bibtex_pattern = r'@(\w+)\s*\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}'
-
-        entries = []
-        for match in re.finditer(bibtex_pattern, bibliography_text, re.DOTALL | re.IGNORECASE):
-            entry_type = match.group(1).lower()
-            entry_content = match.group(2)
-
-            # Extract fields from the BibTeX entry
-            entry_data = self._parse_bibtex_entry(entry_type, entry_content)
-            if entry_data:
-                entries.append(entry_data)
-
-        if not entries:
-            # Fallback: try simpler pattern if the above doesn't work
-            logger.debug("Complex BibTeX pattern failed, trying simpler approach")
-            # Split on @word{ patterns to find entry boundaries
-            parts = re.split(r'(?=@\w+\s*\{)', bibliography_text)
-
-            for part in parts:
-                part = part.strip()
-                if not part or not part.startswith('@'):
-                    continue
-
-                # Find the entry type
-                type_match = re.match(r'@(\w+)\s*\{', part)
-                if not type_match:
-                    continue
-
-                entry_type = type_match.group(1).lower()
-
-                # Extract the content between the first { and the last }
-                # This is a simplified approach but should work for most cases
-                brace_start = part.find('{')
-                if brace_start == -1:
-                    continue
-
-                # Find the matching closing brace
-                brace_count = 0
-                content_end = -1
-                for i, char in enumerate(part[brace_start:], brace_start):
-                    if char == '{':
-                        brace_count += 1
-                    elif char == '}':
-                        brace_count -= 1
-                        if brace_count == 0:
-                            content_end = i
-                            break
-
-                if content_end == -1:
-                    # No matching brace found, take everything after first {
-                    entry_content = part[brace_start + 1:]
-                else:
-                    entry_content = part[brace_start + 1:content_end]
-
-                entry_data = self._parse_bibtex_entry(entry_type, entry_content)
-                if entry_data:
-                    entries.append(entry_data)
+        # Extract references using the BibTeX parser
+        references = parse_bibtex_references(bibliography_text)
 
-        logger.debug(f"Extracted {len(
-        return
+        logger.debug(f"Extracted {len(references)} BibTeX references using dedicated parser")
+        return references
 
-    def
+    def _parse_biblatex_references(self, bibliography_text):
         """
-        Parse
+        Parse biblatex formatted references like [1] Author. "Title". In: Venue. Year.
 
         Args:
-
-            content: Content inside the braces
+            bibliography_text: String containing biblatex .bbl entries
 
         Returns:
-
+            List of structured reference dictionaries
         """
-
-        from utils.
-        from utils.doi_utils import construct_doi_url, is_valid_doi_format
-
-        # Extract key (first part before comma)
-        key_match = re.match(r'([^,]+),', content)
-        key = key_match.group(1).strip() if key_match else ""
-
-        # Extract fields using regex
-        fields = {}
-
-        # Pattern to match field = {value} or field = "value"
-        field_pattern = r'(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")'
-
-        for match in re.finditer(field_pattern, content, re.DOTALL):
-            field_name = match.group(1).lower()
-            field_value = match.group(2) or match.group(3) or ""
-            # Strip outer quotes if present (handles cases like title = {"Some Title"})
-            field_value = field_value.strip()
-            if field_value.startswith('"') and field_value.endswith('"'):
-                field_value = field_value[1:-1]
-            fields[field_name] = field_value
-
-        # If field extraction failed, try a simpler approach
-        if not fields:
-            logger.debug("Field extraction failed, trying line-by-line approach")
-            lines = content.split('\n')
-            for line in lines:
-                line = line.strip()
-                if '=' in line:
-                    field_match = re.match(r'(\w+)\s*=\s*[{"]([^{}"]*)[}"]', line)
-                    if field_match:
-                        field_name = field_match.group(1).lower()
-                        field_value = field_match.group(2).strip()
-                        # Strip outer quotes if present
-                        if field_value.startswith('"') and field_value.endswith('"'):
-                            field_value = field_value[1:-1]
-                        fields[field_name] = field_value
-
-        # Extract required information
-        title = fields.get('title', '')
-        author_string = fields.get('author', '')
-        year = 0
-
-        # Parse year
-        year_str = fields.get('year', '')
-        if year_str:
-            year_match = re.search(r'\d{4}', year_str)
-            if year_match:
-                year = int(year_match.group())
-
-        # If no year found but we have a valid title/authors, try extracting from eprint or other fields
-        if year == 0 and (title or author_string):
-            # Check eprint field for arXiv entries like "2024" prefix
-            eprint = fields.get('eprint', '')
-            if eprint:
-                # Extract year from ArXiv eprint ID (e.g., "2311.09096" -> 2023)
-                eprint_year_match = re.match(r'^(\d{2})(\d{2})', eprint)
-                if eprint_year_match:
-                    yy = int(eprint_year_match.group(1))
-                    # Convert to 4-digit year (23 -> 2023, assumes 21st century)
-                    if yy >= 91:  # ArXiv started in 1991
-                        year = 1900 + yy
-                    else:
-                        year = 2000 + yy
-
-        # For entries without year, set None instead of 0
-        if year == 0:
-            year = None
-
-        # Parse authors using the enhanced function
-        authors = []
-        if author_string:
-            try:
-                authors = parse_authors_with_initials(author_string)
-            except Exception as e:
-                logger.debug(f"Author parsing failed for '{author_string}': {e}")
-                # Fallback: split by 'and' and clean up
-                author_parts = author_string.split(' and ')
-                authors = []
-                for part in author_parts:
-                    # Remove leading "and" from author names (handles cases like "and Krishnamoorthy, S")
-                    part = re.sub(r'^and\s+', '', part.strip())
-                    if part:
-                        authors.append(part)
-
-        # Special handling for @misc entries with only howpublished field
-        if not title and not authors and entry_type == 'misc':
-            howpublished = fields.get('howpublished', '')
-            if howpublished:
-                # Try to extract a URL from howpublished
-                url_patterns = [
-                    r'://([^/]+)',  # Missing protocol case: "://example.com/path"
-                    r'https?://([^/\s]+)',  # Standard URL
-                    r'www\.([^/\s]+)',  # www without protocol
-                ]
-
-                extracted_url = ''
-                for pattern in url_patterns:
-                    match = re.search(pattern, howpublished)
-                    if match:
-                        domain = match.group(1)
-                        # Reconstruct URL with https if protocol was missing
-                        if howpublished.startswith('://'):
-                            extracted_url = 'https' + howpublished
-                        elif not howpublished.startswith(('http://', 'https://')):
-                            extracted_url = 'https://' + howpublished
-                        else:
-                            extracted_url = howpublished
-
-                        # Generate title from domain/path
-                        if 'jailbreakchat.com' in domain:
-                            title = 'JailbreakChat Website'
-                        elif 'lesswrong.com' in domain:
-                            title = 'LessWrong Post: Jailbreaking ChatGPT'
-                        elif 'chat.openai.com' in domain:
-                            title = 'ChatGPT Conversation Share'
-                        elif 'gemini.google.com' in domain:
-                            title = 'Gemini Conversation Share'
-                        elif 'microsoft.com' in domain:
-                            title = 'Microsoft Azure Content Safety API'
-                        elif 'perspectiveapi.com' in domain:
-                            title = 'Perspective API'
-                        else:
-                            # Generic title based on domain
-                            title = f"Web Resource: {domain}"
-
-                        authors = ["Web Resource"]
-                        # Store the extracted URL
-                        fields['url'] = extracted_url
-                        break
-
-        # Apply defaults only if we still don't have values
-        if not authors:
-            authors = ["Unknown Author"]
-
-        # Clean title
-        title = clean_title(title) if title else "Unknown Title"
-
-        # Extract URL/DOI
-        url = fields.get('url', '')
-        doi = fields.get('doi', '')
-
-        # Construct DOI URL if we have a DOI
-        if doi and is_valid_doi_format(doi):
-            url = construct_doi_url(doi)
-
-        # Construct ArXiv URL from eprint field if no URL present
-        if not url:
-            eprint = fields.get('eprint', '')
-            if eprint and re.match(r'^\d{4}\.\d{4,5}', eprint):
-                # Remove version number if present and construct ArXiv URL
-                clean_eprint = re.sub(r'v\d+$', '', eprint)
-                url = f"https://arxiv.org/abs/{clean_eprint}"
-
-        # Handle special URL fields
-        if not url:
-            howpublished = fields.get('howpublished', '')
-            if 'url{' in howpublished or 'href{' in howpublished:
-                url_match = re.search(r'url\{([^}]+)\}', howpublished)
-                if not url_match:
-                    url_match = re.search(r'href\{([^}]+)\}', howpublished)
-                if url_match:
-                    from utils.url_utils import clean_url_punctuation
-                    url = clean_url_punctuation(url_match.group(1))
+        # Use the dedicated biblatex parser
+        from utils.biblatex_parser import parse_biblatex_references
 
-        #
-
-        if 'arxiv' in url.lower() or 'arxiv' in title.lower():
-            ref_type = 'arxiv'
-        elif url or doi:
-            ref_type = 'non-arxiv'
-
-        # Create structured reference
-        structured_ref = {
-            'url': url,
-            'doi': doi,
-            'year': year,
-            'authors': authors,
-            'title': title,
-            'raw_text': f"@{entry_type}{{{key}, {content}}}",
-            'type': ref_type,
-            'bibtex_key': key,
-            'bibtex_type': entry_type
-        }
+        # Extract references using the biblatex parser
+        references = parse_biblatex_references(bibliography_text)
 
-        logger.debug(f"
-        return
+        logger.debug(f"Extracted {len(references)} biblatex references using dedicated parser")
+        return references
 
     def _process_llm_extracted_references(self, references):
         """
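With roughly 270 lines of inline parsing replaced by two delegating calls, the contract is simply text in, list of reference dicts out. A hedged usage sketch, assuming the new utils.bibtex_parser preserves the dict shape built by the deleted inline code ('url', 'doi', 'year', 'authors', 'title', 'raw_text', 'type', 'bibtex_key', 'bibtex_type'):

```python
# Usage sketch for the extracted parser; the entry below is made up, and the
# dict keys are assumed to match those built by the removed inline code.
from utils.bibtex_parser import parse_bibtex_references

bbl_text = """
@article{doe2023example,
  author = {Doe, Jane and Smith, John},
  title  = {An Example Title},
  year   = {2023},
}
"""

for ref in parse_bibtex_references(bbl_text):
    print(ref.get("bibtex_key"), ref.get("title"), ref.get("year"), ref.get("authors"))
```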
@@ -4327,7 +4118,6 @@ class ArxivReferenceChecker:
         unique_references = self._deduplicate_references_with_segment_matching(references)
 
         logger.debug(f"Deduplicated {len(references)} references to {len(unique_references)} unique references")
-        logger.info(f"Extracted {len(unique_references)} references using LLM")
 
         processed_refs = []
 
@@ -5032,8 +4822,7 @@ class ArxivReferenceChecker:
         from utils.text_utils import detect_latex_bibliography_format
         latex_format = detect_latex_bibliography_format(tex_content)
         if latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content):
-            logger.info(f"Found embedded bibliography in ArXiv LaTeX source
-            logger.info(f"Embedded bibliographies often have inconsistent formatting - falling back to alternative extraction methods")
+            logger.info(f"Found embedded bibliography in ArXiv LaTeX source, but skipping due to formatting incompatibility")
             # Skip embedded bibliography and return None to trigger fallback methods
             return None
 
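For context, detect_latex_bibliography_format evidently returns a dict with at least an 'is_latex' key, and the guard above combines it with cheap substring checks before deciding to skip. A minimal sketch of that guard, using only the names visible in the diff (tex_content is assumed to hold the downloaded LaTeX source):

```python
# Sketch of the embedded-bibliography guard; detect_latex_bibliography_format
# and its 'is_latex' key are taken from the diff.
from utils.text_utils import detect_latex_bibliography_format

def embedded_bibliography_present(tex_content):
    latex_format = detect_latex_bibliography_format(tex_content)
    # \bibitem indicates an inline thebibliography; '@' hints at inline BibTeX.
    return latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content)
```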
@@ -5075,6 +4864,40 @@ class ArxivReferenceChecker:
             logger.info(f"Detected LaTeX thebibliography format, using extract_latex_references")
             # Use None for file_path since this is content from .bbl files
             references = extract_latex_references(bibtex_content, None)
+
+            # Validate the parsed references and fallback to LLM if needed
+            from utils.text_utils import validate_parsed_references
+            validation = validate_parsed_references(references)
+
+            if not validation['is_valid']:
+                logger.debug(f"LaTeX parsing validation failed (quality: {validation['quality_score']:.2f})")
+                logger.debug(f"Issues detected: {len(validation['issues'])} problems")
+                for issue in validation['issues'][:5]:  # Log first 5 issues
+                    logger.debug(f"  - {issue}")
+
+                # Try LLM fallback if available
+                if self.llm_extractor:
+                    logger.info("Falling back to LLM-based extraction due to unsupported LaTeX format")
+                    try:
+                        llm_references = self.llm_extractor.extract_references(bibtex_content)
+                        if llm_references:
+                            # Process LLM results first to get structured references
+                            processed_llm_refs = self._process_llm_extracted_references(llm_references)
+                            # Then validate the processed results
+                            llm_validation = validate_parsed_references(processed_llm_refs)
+                            if llm_validation['quality_score'] > validation['quality_score']:
+                                logger.debug(f"LLM extraction successful (quality: {llm_validation['quality_score']:.2f})")
+                                references = processed_llm_refs
+                            else:
+                                logger.debug("LLM extraction didn't improve quality, keeping original results")
+                        else:
+                            logger.warning("LLM extraction returned no results")
+                    except Exception as e:
+                        logger.error(f"LLM fallback failed: {e}")
+                else:
+                    logger.warning("No LLM available for fallback, using original parsing results")
+            else:
+                logger.info(f"LaTeX parsing validation passed (quality: {validation['quality_score']:.2f})")
         else:
             # Parse BibTeX using the standard flow (LLM or regex based on config)
             references = self.parse_references(bibtex_content)
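The validation contract used above is worth spelling out: validate_parsed_references returns a dict with at least 'is_valid', 'quality_score', and 'issues', and the LLM result only replaces the LaTeX parse when its quality score is strictly higher. A trimmed sketch of that compare-and-swap (it omits the _process_llm_extracted_references post-processing step shown in the diff, and the extractor is assumed to expose extract_references(text) as shown there):

```python
# Trimmed sketch of the quality-gated LLM fallback; validate_parsed_references
# and its result keys appear in the diff, the extractor interface is assumed.
from utils.text_utils import validate_parsed_references

def best_parse(latex_refs, llm_extractor, raw_bbl_text):
    validation = validate_parsed_references(latex_refs)
    if validation['is_valid'] or llm_extractor is None:
        return latex_refs
    llm_refs = llm_extractor.extract_references(raw_bbl_text)
    if not llm_refs:
        return latex_refs
    # Keep whichever parse scores higher; ties keep the original LaTeX results.
    if validate_parsed_references(llm_refs)['quality_score'] > validation['quality_score']:
        return llm_refs
    return latex_refs
```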
@@ -5088,7 +4911,7 @@ class ArxivReferenceChecker:
             logger.warning(f"Could not save debug references file for {paper_id}: {e}")
 
         if references:
-            logger.
+            logger.debug(f"Extracted {len(references)} references")
             return references
 
         # Check if this is a text file containing references
@@ -5158,7 +4981,7 @@ class ArxivReferenceChecker:
             bibtex_references = extract_latex_references(bib_content, paper.file_path)
 
             if bibtex_references:
-                logger.
+                logger.debug(f"Extracted {len(bibtex_references)} references from BibTeX file")
                 return bibtex_references
             else:
                 logger.warning(f"No references found in BibTeX file: {paper.file_path}")