academic-refchecker 1.2.36__py3-none-any.whl → 1.2.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __version__.py +1 -1
- {academic_refchecker-1.2.36.dist-info → academic_refchecker-1.2.38.dist-info}/METADATA +1 -1
- {academic_refchecker-1.2.36.dist-info → academic_refchecker-1.2.38.dist-info}/RECORD +14 -11
- core/refchecker.py +56 -299
- utils/arxiv_utils.py +77 -1
- utils/biblatex_parser.py +485 -0
- utils/bibliography_utils.py +332 -0
- utils/bibtex_parser.py +334 -0
- utils/text_utils.py +72 -183
- utils/url_utils.py +29 -12
- {academic_refchecker-1.2.36.dist-info → academic_refchecker-1.2.38.dist-info}/WHEEL +0 -0
- {academic_refchecker-1.2.36.dist-info → academic_refchecker-1.2.38.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-1.2.36.dist-info → academic_refchecker-1.2.38.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-1.2.36.dist-info → academic_refchecker-1.2.38.dist-info}/top_level.txt +0 -0
{academic_refchecker-1.2.36.dist-info → academic_refchecker-1.2.38.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
-__version__.py,sha256=
-academic_refchecker-1.2.
+__version__.py,sha256=9ez-UBx1mkgUvDMk-z63_XpqOh2QnPCeTrDEuricP1w,65
+academic_refchecker-1.2.38.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
 checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
 checkers/crossref.py,sha256=Hzq4dlf1CSn0aZWU8CMOnLxIvaSivTabLoepIOkgkmY,20585
 checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
@@ -15,7 +15,7 @@ config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
 core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
 core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
 core/parallel_processor.py,sha256=2S1cAPhtWH3glvtiJrt9JkZzk2iJkPKXsc-F3lg0X6U,16795
-core/refchecker.py,sha256=
+core/refchecker.py,sha256=8EatAqYEDpW219Xrn-ql1oQ5ytmCU8RW8pMtlujRbC8,273167
 database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
 database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
 llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -26,18 +26,21 @@ scripts/start_vllm_server.py,sha256=ZepWp2y2cKFW0Kgsoima2RbmF02fTU29UFcLLpsBhFU,
 services/__init__.py,sha256=jGi9S74Msak3YR-C4Qb68VU7HB4oLaX9o1rlVAFpOFI,187
 services/pdf_processor.py,sha256=vu_JnhFGZY6jFVbDbPvG-mlQojvB-3Dzc8_946KVV2E,9427
 utils/__init__.py,sha256=1RrGoIIn1_gVzxd56b6a7HeAS-wu7uDP-nxLbR3fJ-8,1199
-utils/arxiv_utils.py,sha256=
+utils/arxiv_utils.py,sha256=Y8sDJgDwHxp1L33BkQoDumIl0Pkp-BuYZb1PwWYsmak,18251
 utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
+utils/biblatex_parser.py,sha256=Vznt-BfNtQQb4XQ6iPab2CgFcV2JIjva1OU33NzQ51g,20253
+utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,11705
+utils/bibtex_parser.py,sha256=jsQ87lkzmBmJO3VEN3itw22CJ1Hesei4IvM2sfsaFKI,12867
 utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
 utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
 utils/doi_utils.py,sha256=8f3iE4RdSNkzeqa9wJfoKcVEiBVse3_uf643biLudmw,4134
 utils/error_utils.py,sha256=2qdRM3Bv4GvE3mlXgXp9jiQBfvB08qeg8vTgNVivcgk,5706
 utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
-utils/text_utils.py,sha256=
+utils/text_utils.py,sha256=KjNx_UJvVhz-oowu4CCdryEuN0hYLu4X8yVkjdYP8fM,189261
 utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
-utils/url_utils.py,sha256=
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
+utils/url_utils.py,sha256=n0m5rXKV0-UrE8lI85VEx23KmfGwky57sI6gFPuu78I,7358
+academic_refchecker-1.2.38.dist-info/METADATA,sha256=7V0yEKZy9zao6s3_TBHPOg7Gi86h4lG2m_rhyhStq5w,22298
+academic_refchecker-1.2.38.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+academic_refchecker-1.2.38.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
+academic_refchecker-1.2.38.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
+academic_refchecker-1.2.38.dist-info/RECORD,,
core/refchecker.py
CHANGED
@@ -451,47 +451,10 @@ class ArxivReferenceChecker:
 
     def extract_arxiv_id_from_url(self, url):
         """
-        Extract ArXiv ID from a URL or text containing ArXiv reference
+        Extract ArXiv ID from a URL or text containing ArXiv reference.
+        Uses the common extraction function from utils.url_utils.
         """
-
-        return None
-
-        # First, check for arXiv: format (e.g., "arXiv:1610.10099" or "arXiv preprint arXiv:1610.10099")
-        arxiv_match = re.search(r'arXiv:(\d{4}\.\d{4,5})', url, re.IGNORECASE)
-        if arxiv_match:
-            arxiv_id = arxiv_match.group(1)
-            # Remove version number if present
-            arxiv_id = re.sub(r'v\d+$', '', arxiv_id)
-            return arxiv_id
-
-        # Remove version string from end if present (e.g., 'v1')
-        url = re.sub(r'v\d+$', '', url)
-
-        # Parse URL
-        parsed_url = urlparse(url)
-
-        # Check if it's an arxiv.org URL
-        if 'arxiv.org' in parsed_url.netloc:
-            # Extract ID from path
-            path = parsed_url.path.strip('/')
-
-            # Handle different URL formats
-            if path.startswith('abs/'):
-                arxiv_id = path.replace('abs/', '')
-            elif path.startswith('pdf/'):
-                arxiv_id = path.replace('pdf/', '').replace('.pdf', '')
-            elif '/abs/' in path:
-                arxiv_id = path.split('/abs/')[1]
-            elif '/pdf/' in path:
-                arxiv_id = path.split('/pdf/')[1].replace('.pdf', '')
-            else:
-                arxiv_id = path
-
-            # Remove version number from the extracted ID
-            arxiv_id = re.sub(r'v\d+$', '', arxiv_id)
-            return arxiv_id
-
-        return None
+        return extract_arxiv_id_from_url(url)
 
     def get_paper_metadata(self, arxiv_id):
         """
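For context, the inline URL handling removed above is consolidated into utils/url_utils.py. A minimal standalone sketch of the extraction behavior being consolidated, assuming the shared helper covers the same URL shapes the old inline code handled (the shipped implementation may differ in details):

```python
import re
from urllib.parse import urlparse

def extract_arxiv_id(text):
    """Illustrative re-implementation of the consolidated helper."""
    # "arXiv:1610.10099" style references
    m = re.search(r'arXiv:(\d{4}\.\d{4,5})', text, re.IGNORECASE)
    if m:
        return m.group(1)
    # arxiv.org URLs such as https://arxiv.org/abs/1610.10099v2 or .../pdf/1610.10099.pdf
    parsed = urlparse(re.sub(r'v\d+$', '', text))
    if 'arxiv.org' in parsed.netloc:
        path = parsed.path.strip('/')
        for marker in ('abs/', 'pdf/'):
            if marker in path:
                candidate = path.split(marker, 1)[1].replace('.pdf', '')
                return re.sub(r'v\d+$', '', candidate)
        return re.sub(r'v\d+$', '', path)
    return None

assert extract_arxiv_id("https://arxiv.org/abs/1610.10099v2") == "1610.10099"
assert extract_arxiv_id("arXiv preprint arXiv:1610.10099") == "1610.10099"
```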
@@ -3410,13 +3373,21 @@ class ArxivReferenceChecker:
             return self._parse_standard_acm_natbib_references(bibliography_text)
 
         # Check if this is BibTeX format
-        from utils.
+        from utils.bibtex_parser import detect_bibtex_format
         if detect_bibtex_format(bibliography_text):
             logger.info("Detected BibTeX format, using BibTeX parser")
             self.used_regex_extraction = True
             # Note: BibTeX parsing is robust, so we don't set used_unreliable_extraction
             return self._parse_bibtex_references(bibliography_text)
 
+        # Check if this is biblatex format
+        from utils.biblatex_parser import detect_biblatex_format
+        if detect_biblatex_format(bibliography_text):
+            logger.info("Detected biblatex format, using biblatex parser")
+            self.used_regex_extraction = True
+            # Note: biblatex parsing is also robust, so we don't set used_unreliable_extraction
+            return self._parse_biblatex_references(bibliography_text)
+
         # For non-standard formats, try LLM-based extraction if available
        if self.llm_extractor:
            try:
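Read together with the BibTeX branch above, the method now forms a detection cascade: structured BibTeX first, then biblatex .bbl output, then the LLM/regex fallbacks below. A hedged sketch of that control flow; the two detect_* heuristics here are illustrative stand-ins, not the implementations shipped in the new parser modules:

```python
import re

def detect_bibtex_format(text: str) -> bool:
    # BibTeX sources are dominated by @entrytype{key, ...} records
    return bool(re.search(r'@\w+\s*\{\s*[^,\s]+\s*,', text))

def detect_biblatex_format(text: str) -> bool:
    # biblatex .bbl output tends to read like: [1] A. Author. "Title". In: Venue. 2020.
    return bool(re.search(r'^\[\d+\]\s', text, re.MULTILINE))

def parse_references(bibliography_text: str) -> str:
    if detect_bibtex_format(bibliography_text):
        return "bibtex"      # -> _parse_bibtex_references
    if detect_biblatex_format(bibliography_text):
        return "biblatex"    # -> _parse_biblatex_references
    return "llm-or-regex"    # -> fallback extraction

print(parse_references('@article{key, title={T}}'))   # bibtex
print(parse_references('[1] A. Author. "T". 2020.'))  # biblatex
```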
@@ -3573,11 +3544,9 @@ class ArxivReferenceChecker:
             # Clean author part and extract authors
             author_part_clean = strip_latex_commands(author_part).strip()
             if author_part_clean and not author_part_clean.startswith('\\'):
-                # Parse author names
-
-
-                else:
-                    author_names = [name.strip() for name in author_part_clean.split(',')]
+                # Parse author names using the robust author parsing function
+                from utils.text_utils import parse_authors_with_initials
+                author_names = parse_authors_with_initials(author_part_clean)
 
                 # Clean up author names
                 authors = []
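The replaced branch split the cleaned author string on every comma, which mangles "Last, F."-style names into separate bogus authors; the new code delegates to parse_authors_with_initials instead. A standalone sketch of the failure mode and of an initials-aware split (the regex here is illustrative, not the routine shipped in utils/text_utils.py):

```python
import re

def split_authors(author_str):
    # Split on a comma only when it is NOT followed by bare initials,
    # i.e. never split between a surname and its "J." / "A. B." part.
    return [p.strip() for p in
            re.split(r',\s*(?![A-Z]\.(?:\s*[A-Z]\.)*(?:,|$))', author_str)
            if p.strip()]

names = "Smith, J., Doe, A. B."
print([n.strip() for n in names.split(',')])  # ['Smith', 'J.', 'Doe', 'A. B.'] -- wrong
print(split_authors(names))                   # ['Smith, J.', 'Doe, A. B.']
```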
@@ -3630,11 +3599,19 @@ class ArxivReferenceChecker:
         self.used_regex_extraction = True
 
         # Check if this is BibTeX format first
-        
+        from utils.bibtex_parser import detect_bibtex_format
+        if detect_bibtex_format(bibliography_text):
             logger.debug("Detected BibTeX format, using BibTeX-specific parsing")
             # BibTeX parsing is robust, so we don't set used_unreliable_extraction
             return self._parse_bibtex_references(bibliography_text)
 
+        # Check if this is biblatex format
+        from utils.biblatex_parser import detect_biblatex_format
+        if detect_biblatex_format(bibliography_text):
+            logger.debug("Detected biblatex format, using biblatex-specific parsing")
+            # biblatex parsing is also robust, so we don't set used_unreliable_extraction
+            return self._parse_biblatex_references(bibliography_text)
+
         # If we reach here, we're using the unreliable fallback regex parsing
         self.used_unreliable_extraction = True
@@ -4066,214 +4043,33 @@ class ArxivReferenceChecker:
         Returns:
             List of structured reference dictionaries
         """
-        # Use the
-        from utils.
+        # Use the dedicated BibTeX parser
+        from utils.bibtex_parser import parse_bibtex_references
 
-        # Extract references using the
-        references =
+        # Extract references using the BibTeX parser
+        references = parse_bibtex_references(bibliography_text)
 
-        logger.debug(f"Extracted {len(references)} BibTeX references using
+        logger.debug(f"Extracted {len(references)} BibTeX references using dedicated parser")
         return references
 
-    def
+    def _parse_biblatex_references(self, bibliography_text):
         """
-        Parse
+        Parse biblatex formatted references like [1] Author. "Title". In: Venue. Year.
 
         Args:
-
-            content: Content inside the braces
+            bibliography_text: String containing biblatex .bbl entries
 
         Returns:
-
+            List of structured reference dictionaries
         """
-
-        from utils.
-        from utils.doi_utils import construct_doi_url, is_valid_doi_format
-
-        # Extract key (first part before comma)
-        key_match = re.match(r'([^,]+),', content)
-        key = key_match.group(1).strip() if key_match else ""
-
-        # Extract fields using regex
-        fields = {}
-
-        # Pattern to match field = {value} or field = "value"
-        field_pattern = r'(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")'
-
-        for match in re.finditer(field_pattern, content, re.DOTALL):
-            field_name = match.group(1).lower()
-            field_value = match.group(2) or match.group(3) or ""
-            # Strip outer quotes if present (handles cases like title = {"Some Title"})
-            field_value = field_value.strip()
-            if field_value.startswith('"') and field_value.endswith('"'):
-                field_value = field_value[1:-1]
-            fields[field_name] = field_value
-
-        # If field extraction failed, try a simpler approach
-        if not fields:
-            logger.debug("Field extraction failed, trying line-by-line approach")
-            lines = content.split('\n')
-            for line in lines:
-                line = line.strip()
-                if '=' in line:
-                    field_match = re.match(r'(\w+)\s*=\s*[{"]([^{}"]*)[}"]', line)
-                    if field_match:
-                        field_name = field_match.group(1).lower()
-                        field_value = field_match.group(2).strip()
-                        # Strip outer quotes if present
-                        if field_value.startswith('"') and field_value.endswith('"'):
-                            field_value = field_value[1:-1]
-                        fields[field_name] = field_value
-
-        # Extract required information
-        title = fields.get('title', '')
-        author_string = fields.get('author', '')
-        year = 0
-
-        # Parse year
-        year_str = fields.get('year', '')
-        if year_str:
-            year_match = re.search(r'\d{4}', year_str)
-            if year_match:
-                year = int(year_match.group())
-
-        # If no year found but we have a valid title/authors, try extracting from eprint or other fields
-        if year == 0 and (title or author_string):
-            # Check eprint field for arXiv entries like "2024" prefix
-            eprint = fields.get('eprint', '')
-            if eprint:
-                # Extract year from ArXiv eprint ID (e.g., "2311.09096" -> 2023)
-                eprint_year_match = re.match(r'^(\d{2})(\d{2})', eprint)
-                if eprint_year_match:
-                    yy = int(eprint_year_match.group(1))
-                    # Convert to 4-digit year (23 -> 2023, assumes 21st century)
-                    if yy >= 91:  # ArXiv started in 1991
-                        year = 1900 + yy
-                    else:
-                        year = 2000 + yy
-
-        # For entries without year, set None instead of 0
-        if year == 0:
-            year = None
-
-        # Parse authors using the enhanced function
-        authors = []
-        if author_string:
-            try:
-                authors = parse_authors_with_initials(author_string)
-            except Exception as e:
-                logger.debug(f"Author parsing failed for '{author_string}': {e}")
-                # Fallback: split by 'and' and clean up
-                author_parts = author_string.split(' and ')
-                authors = []
-                for part in author_parts:
-                    # Remove leading "and" from author names (handles cases like "and Krishnamoorthy, S")
-                    part = re.sub(r'^and\s+', '', part.strip())
-                    if part:
-                        authors.append(part)
-
-        # Special handling for @misc entries with only howpublished field
-        if not title and not authors and entry_type == 'misc':
-            howpublished = fields.get('howpublished', '')
-            if howpublished:
-                # Try to extract a URL from howpublished
-                url_patterns = [
-                    r'://([^/]+)',  # Missing protocol case: "://example.com/path"
-                    r'https?://([^/\s]+)',  # Standard URL
-                    r'www\.([^/\s]+)',  # www without protocol
-                ]
-
-                extracted_url = ''
-                for pattern in url_patterns:
-                    match = re.search(pattern, howpublished)
-                    if match:
-                        domain = match.group(1)
-                        # Reconstruct URL with https if protocol was missing
-                        if howpublished.startswith('://'):
-                            extracted_url = 'https' + howpublished
-                        elif not howpublished.startswith(('http://', 'https://')):
-                            extracted_url = 'https://' + howpublished
-                        else:
-                            extracted_url = howpublished
-
-                        # Generate title from domain/path
-                        if 'jailbreakchat.com' in domain:
-                            title = 'JailbreakChat Website'
-                        elif 'lesswrong.com' in domain:
-                            title = 'LessWrong Post: Jailbreaking ChatGPT'
-                        elif 'chat.openai.com' in domain:
-                            title = 'ChatGPT Conversation Share'
-                        elif 'gemini.google.com' in domain:
-                            title = 'Gemini Conversation Share'
-                        elif 'microsoft.com' in domain:
-                            title = 'Microsoft Azure Content Safety API'
-                        elif 'perspectiveapi.com' in domain:
-                            title = 'Perspective API'
-                        else:
-                            # Generic title based on domain
-                            title = f"Web Resource: {domain}"
-
-                        authors = ["Web Resource"]
-                        # Store the extracted URL
-                        fields['url'] = extracted_url
-                        break
+        # Use the dedicated biblatex parser
+        from utils.biblatex_parser import parse_biblatex_references
 
-        #
-
-        authors = ["Unknown Author"]
-
-        # Clean title
-        title = clean_title(title) if title else "Unknown Title"
-
-        # Extract URL/DOI
-        url = fields.get('url', '')
-        doi = fields.get('doi', '')
-
-        # Construct DOI URL if we have a DOI
-        if doi and is_valid_doi_format(doi):
-            url = construct_doi_url(doi)
-
-        # Construct ArXiv URL from eprint field if no URL present
-        if not url:
-            eprint = fields.get('eprint', '')
-            if eprint and re.match(r'^\d{4}\.\d{4,5}', eprint):
-                # Remove version number if present and construct ArXiv URL
-                clean_eprint = re.sub(r'v\d+$', '', eprint)
-                url = f"https://arxiv.org/abs/{clean_eprint}"
-
-        # Handle special URL fields
-        if not url:
-            howpublished = fields.get('howpublished', '')
-            if 'url{' in howpublished or 'href{' in howpublished:
-                url_match = re.search(r'url\{([^}]+)\}', howpublished)
-                if not url_match:
-                    url_match = re.search(r'href\{([^}]+)\}', howpublished)
-                if url_match:
-                    from utils.url_utils import clean_url_punctuation
-                    url = clean_url_punctuation(url_match.group(1))
-
-        # Determine reference type
-        ref_type = 'other'
-        if 'arxiv' in url.lower() or 'arxiv' in title.lower():
-            ref_type = 'arxiv'
-        elif url or doi:
-            ref_type = 'non-arxiv'
-
-        # Create structured reference
-        structured_ref = {
-            'url': url,
-            'doi': doi,
-            'year': year,
-            'authors': authors,
-            'title': title,
-            'raw_text': f"@{entry_type}{{{key}, {content}}}",
-            'type': ref_type,
-            'bibtex_key': key,
-            'bibtex_type': entry_type
-        }
+        # Extract references using the biblatex parser
+        references = parse_biblatex_references(bibliography_text)
 
-        logger.debug(f"
-        return
+        logger.debug(f"Extracted {len(references)} biblatex references using dedicated parser")
+        return references
 
     def _process_llm_extracted_references(self, references):
         """
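Net effect of this hunk: the long inline BibTeX field-extraction body is gone, and both methods become thin wrappers over the new modules, with _parse_biblatex_references added alongside. A sketch of the resulting call shape, assuming the refchecker package is importable; the function names and the structured-reference keys (url, doi, year, authors, title, raw_text, type) are taken from this diff, while the sample input string is made up:

```python
from utils.bibtex_parser import parse_bibtex_references

sample = '@article{vaswani2017, title={Attention Is All You Need}, year={2017}}'
for ref in parse_bibtex_references(sample):
    # Each structured reference is a dict; BibTeX input additionally carries
    # bibtex_key and bibtex_type, mirroring the removed structured_ref layout.
    print(ref['title'], ref.get('year'), ref.get('type'))
```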
@@ -4429,8 +4225,17 @@ class ArxivReferenceChecker:
             return True
 
         # Also check if authors have significant overlap (at least 50% of the shorter author list)
-
-
+        from utils.text_utils import parse_authors_with_initials
+
+        if '*' in seg1['author']:
+            author1_parts = seg1['author'].split('*')
+        else:
+            author1_parts = parse_authors_with_initials(seg1['author'])
+
+        if '*' in seg2['author']:
+            author2_parts = seg2['author'].split('*')
+        else:
+            author2_parts = parse_authors_with_initials(seg2['author'])
 
         # Clean and normalize author names
         author1_clean = {a.strip().lower() for a in author1_parts if a.strip() and a.strip() not in ['et al', 'others']}
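For reference, the overlap rule applied just below, reduced to a standalone sketch: authors are considered a match when at least 50% of the shorter normalized author list also appears in the other one. The threshold and the 'et al'/'others' filtering come from the surrounding lines; name normalization is simplified here:

```python
def authors_overlap(authors1, authors2, threshold=0.5):
    # Normalize and drop placeholder entries, as the context lines above do
    clean1 = {a.strip().lower() for a in authors1 if a.strip() and a.strip() not in ('et al', 'others')}
    clean2 = {a.strip().lower() for a in authors2 if a.strip() and a.strip() not in ('et al', 'others')}
    if not clean1 or not clean2:
        return False
    shared = len(clean1 & clean2)
    return shared >= threshold * min(len(clean1), len(clean2))

print(authors_overlap(['A. Vaswani', 'N. Shazeer'], ['a. vaswani', 'et al']))  # True
```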
@@ -4945,55 +4750,6 @@ class ArxivReferenceChecker:
         }
 
 
-    def _get_bibtex_content(self, paper):
-        """
-        Try to get BibTeX content for a paper from various sources.
-
-        Args:
-            paper: Paper object
-
-        Returns:
-            str: BibTeX content if found, None otherwise
-        """
-        # Try ArXiv source if it's an ArXiv paper
-        from utils.arxiv_utils import extract_arxiv_id_from_paper, download_arxiv_source
-
-        arxiv_id = extract_arxiv_id_from_paper(paper)
-        if arxiv_id:
-            logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
-            tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
-
-            # Prefer .bib files (most structured), then .bbl files
-            if bib_content:
-                logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
-
-                # If we have LaTeX content, filter BibTeX by cited keys
-                if tex_content:
-                    from utils.text_utils import extract_cited_keys_from_latex, filter_bibtex_by_cited_keys
-                    cited_keys = extract_cited_keys_from_latex(tex_content)
-                    if cited_keys:
-                        logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
-                        filtered_content = filter_bibtex_by_cited_keys(bib_content, cited_keys)
-                        return filtered_content
-
-                return bib_content
-
-            elif bbl_content:
-                logger.info(f"Found .bbl files in ArXiv source for {arxiv_id}")
-                return bbl_content
-
-            elif tex_content:
-                # Check for embedded bibliography in LaTeX
-                from utils.text_utils import detect_latex_bibliography_format
-                latex_format = detect_latex_bibliography_format(tex_content)
-                if latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content):
-                    logger.info(f"Found embedded bibliography in ArXiv LaTeX source, but skipping due to formatting incompatibility")
-                    # Skip embedded bibliography and return None to trigger fallback methods
-                    return None
-
-        # Could add other BibTeX sources here (e.g., direct BibTeX URLs, etc.)
-
-        return None
 
 
     def extract_bibliography(self, paper, debug_mode=False):
@@ -5008,7 +4764,8 @@ class ArxivReferenceChecker:
         logger.debug(f"Extracting bibliography for paper {paper_id}: {paper.title}")
 
         # Check if we can get BibTeX content for this paper (ArXiv or other sources)
-
+        from utils.arxiv_utils import get_bibtex_content
+        bibtex_content = get_bibtex_content(paper)
         if bibtex_content:
             logger.debug(f"Found BibTeX content for {paper_id}, using structured bibliography")
 
@@ -5062,7 +4819,7 @@ class ArxivReferenceChecker:
                 else:
                     logger.warning("No LLM available for fallback, using original parsing results")
             else:
-                logger.
+                logger.debug(f"LaTeX parsing validation passed (quality: {validation['quality_score']:.2f})")
         else:
             # Parse BibTeX using the standard flow (LLM or regex based on config)
             references = self.parse_references(bibtex_content)
@@ -5076,7 +4833,7 @@ class ArxivReferenceChecker:
                 logger.warning(f"Could not save debug references file for {paper_id}: {e}")
 
         if references:
-            logger.
+            logger.debug(f"Extracted {len(references)} references")
             return references
 
         # Check if this is a text file containing references
@@ -5146,7 +4903,7 @@ class ArxivReferenceChecker:
             bibtex_references = extract_latex_references(bib_content, paper.file_path)
 
             if bibtex_references:
-                logger.
+                logger.debug(f"Extracted {len(bibtex_references)} references from BibTeX file")
                 return bibtex_references
             else:
                 logger.warning(f"No references found in BibTeX file: {paper.file_path}")
@@ -5623,7 +5380,7 @@ class ArxivReferenceChecker:
             error_details = unverified_errors[0].get('error_details', '')
             if error_details:
                 subreason = self._categorize_unverified_reason(error_details)
-                print(f"
+                print(f"  Subreason: {subreason}")
 
         year_str = self._format_year_string(reference.get('year'))
utils/arxiv_utils.py
CHANGED
@@ -288,7 +288,7 @@ def filter_bibtex_by_citations(bib_content, tex_files, main_tex_content):
         return bib_content
 
     # Parse BibTeX entries and filter
-    from utils.
+    from utils.bibtex_parser import parse_bibtex_entries
     entries = parse_bibtex_entries(bib_content)
 
     # Filter entries to only cited ones
@@ -374,3 +374,79 @@ def reconstruct_bibtex_content(cited_entries, original_content):
     return '\n\n'.join(filtered_parts) + '\n'
 
 
+def get_bibtex_content(paper):
+    """
+    Try to get BibTeX content for a paper from various sources.
+
+    Args:
+        paper: Paper object
+
+    Returns:
+        str: BibTeX content if found, None otherwise
+    """
+    import re
+
+    # Try ArXiv source if it's an ArXiv paper
+    arxiv_id = extract_arxiv_id_from_paper(paper)
+    if arxiv_id:
+        logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
+        tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
+
+        # Choose between .bib and .bbl files based on content richness
+        # Prioritize .bbl if it has more references than filtered .bib, otherwise prefer .bib
+        if bib_content and bbl_content:
+            # Count entries in both
+            bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
+            bbl_entry_count = len(re.findall(r'\\bibitem\[', bbl_content))
+
+            # If we have LaTeX content, get filtered BibTeX count
+            filtered_bib_count = bib_entry_count
+            filtered_content = bib_content
+            if tex_content:
+                cited_keys = extract_cited_keys_from_tex({}, tex_content)
+                if cited_keys:
+                    logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
+                    filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
+                    filtered_bib_count = len(re.findall(r'@\w+\s*\{', filtered_content))
+
+            logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, filtered .bib has {filtered_bib_count} entries")
+
+            # Prioritize .bbl if it has significantly more entries
+            if bbl_entry_count > filtered_bib_count * 1.5:  # 50% more entries threshold
+                logger.info(f"Using .bbl files from ArXiv source")
+                return bbl_content
+            else:
+                logger.info(f"Using filtered .bib files")
+                return filtered_content
+
+        elif bib_content:
+            logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
+
+            # If we have LaTeX content, filter BibTeX by cited keys
+            if tex_content:
+                cited_keys = extract_cited_keys_from_tex({}, tex_content)
+                if cited_keys:
+                    logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
+                    filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
+                    return filtered_content
+
+            return bib_content
+
+        elif bbl_content:
+            logger.info(f"Found .bbl files in ArXiv source for {arxiv_id}")
+            return bbl_content
+
+        elif tex_content:
+            # Check for embedded bibliography in LaTeX
+            from utils.text_utils import detect_latex_bibliography_format
+            latex_format = detect_latex_bibliography_format(tex_content)
+            if latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content):
+                logger.info(f"Found embedded bibliography in ArXiv LaTeX source, but skipping due to formatting incompatibility")
+                # Skip embedded bibliography and return None to trigger fallback methods
+                return None
+
+    # Could add other BibTeX sources here (e.g., direct BibTeX URLs, etc.)
+
+    return None
+
+
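Compared with the removed _get_bibtex_content in core/refchecker.py, this relocated version adds one behavior: when both .bib and .bbl sources exist, it counts records in each and only falls back to .bbl when it is more than 1.5x richer than the citation-filtered .bib. The core of that heuristic as a standalone sketch (the regexes are copied from the function above; the sample strings are made up):

```python
import re

def pick_bibliography(filtered_bib: str, bbl: str):
    bib_count = len(re.findall(r'@\w+\s*\{', filtered_bib))
    bbl_count = len(re.findall(r'\\bibitem\[', bbl))
    if bbl_count > bib_count * 1.5:  # .bbl must be significantly richer
        return 'bbl', bbl
    return 'bib', filtered_bib

bib = '@article{a,\n}\n@misc{b,\n}'
bbl = '\n'.join(f'\\bibitem[{i}]{{k{i}}} ...' for i in range(1, 5))
print(pick_bibliography(bib, bbl)[0])  # 'bbl' (4 entries > 2 * 1.5)
```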