academic-refchecker 2.0.8-py3-none-any.whl → 2.0.10-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 2.0.8
+Version: 2.0.10
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT
@@ -1,10 +1,10 @@
-academic_refchecker-2.0.8.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
+academic_refchecker-2.0.10.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
 backend/__init__.py,sha256=TFVkOx5tSp3abty15RzUbaSwQ9ZD0kfUn7PDh63xkYY,521
 backend/__main__.py,sha256=74V7yUMsRSZaaRyXYm-rZVc3TVUcUgwsoTQTUbV5EqM,211
 backend/cli.py,sha256=xV3l9M5OdNQQYOcrzj2d_7RmCgj7CXP_1oi0TPe6zNo,1672
 backend/concurrency.py,sha256=2KY9I_8dDkyl_HTGx27ZxU4rFXx2vqbGOlo5RrRbPjA,3223
 backend/database.py,sha256=1jLP1m9vNk5sEs4bh_xmX0T5ilZkUTX1c7nOVz5XnNc,30681
-backend/main.py,sha256=2ziCLwEmvPPtSiF6nuh2az2Lqg8JI9PytKWiow1V-4M,54586
+backend/main.py,sha256=cenE0Vxleh1LP45EOUqh4FTCXCS0OXbPOYWxaOLMfGE,54778
 backend/models.py,sha256=El2F-RTHgxQ7-WODmiYCpjsTFDpjwF9PBt-JDa_XipE,2591
 backend/refchecker_wrapper.py,sha256=ZOg5Rc0Mgac3ALwxA55pTCeqCL06AWOBZLQeTeZEJcY,52038
 backend/thumbnail.py,sha256=wPFXp3RlmcL9jVKZmSBRB7Pfy9Ti7nCnzNtL4osfNtM,17618
@@ -16,7 +16,7 @@ backend/static/assets/index-2P6L_39v.css,sha256=KC3Wa6jfD1qwmEoVpqTovlzf8fsn5oHY
 backend/static/assets/index-hk21nqxR.js,sha256=z2agP8ZFYw4AfYi-GJ5E_8_k-lPF-frXOJtPk-I0hDs,369533
 refchecker/__init__.py,sha256=Pg5MrtLxDBRcNYcI02N-bv3tzURVd1S3nQ8IyF7Zw7E,322
 refchecker/__main__.py,sha256=agBbT9iKN0g2xXtRNCoh29Nr7z2n5vU-r0MCVJKi4tI,232
-refchecker/__version__.py,sha256=Zjb1PH2--VphovcG6srpeLZmZ4Kukc7voiH8Phuvx7c,65
+refchecker/__version__.py,sha256=-Z5Qa0W7m3Azi2xuo3NQNPvyofIq7M771Vvd2YjQ1-4,66
 refchecker/checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
 refchecker/checkers/crossref.py,sha256=88moAyTudBqf9SKqTQkNAq1yyuRe95f8r4EpmJznupQ,20937
 refchecker/checkers/enhanced_hybrid_checker.py,sha256=2jIeUX7hankPok3M4de9o2bsJZ17ZomuLkdfdr9EV0s,28671
@@ -54,11 +54,11 @@ refchecker/utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,
 refchecker/utils/doi_utils.py,sha256=_7YvQ0DTOQBMIujUE0SdJicjPiAR3VETLU668GIji24,6094
 refchecker/utils/error_utils.py,sha256=8TcfRUD6phZ7viPJrezQ4jKf_vE65lqEXZq5707eU6s,15425
 refchecker/utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
-refchecker/utils/text_utils.py,sha256=v5beDt_fyx4ETfTXLYrDMp3CuUGoDoLs7-d1H2GdySE,228585
+refchecker/utils/text_utils.py,sha256=ZIdvP75F_4o_p2lB24CkuX_eEjB9x-BY2FlXsOiYjkQ,234082
 refchecker/utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
 refchecker/utils/url_utils.py,sha256=7b0rWCQJSajzqOvD7ghsBZPejiq6mUIz6SGhvU_WGDs,9441
-academic_refchecker-2.0.8.dist-info/METADATA,sha256=-hTJhL3BwqS2hvFrvt3AKnNqIncD9wU3ltrJxdsK1F0,26575
-academic_refchecker-2.0.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-academic_refchecker-2.0.8.dist-info/entry_points.txt,sha256=9cREsaKwlp05Ql0CBIjKrNHk5IG2cHY5LvJPsV2-SxA,108
-academic_refchecker-2.0.8.dist-info/top_level.txt,sha256=FfNvrvpj25gfpUBjW0epvz7Qrdejhups5Za_DBiSRu4,19
-academic_refchecker-2.0.8.dist-info/RECORD,,
+academic_refchecker-2.0.10.dist-info/METADATA,sha256=1cTa4-OOQW4VJHJbGuc9OWStOx9j8qKRKF0iDhe7vbk,26576
+academic_refchecker-2.0.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+academic_refchecker-2.0.10.dist-info/entry_points.txt,sha256=9cREsaKwlp05Ql0CBIjKrNHk5IG2cHY5LvJPsV2-SxA,108
+academic_refchecker-2.0.10.dist-info/top_level.txt,sha256=FfNvrvpj25gfpUBjW0epvz7Qrdejhups5Za_DBiSRu4,19
+academic_refchecker-2.0.10.dist-info/RECORD,,
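
Each RECORD entry above is `path,digest,size`, where the digest is an unpadded urlsafe-base64 SHA-256 per the wheel spec. The one-byte size change on refchecker/__version__.py (65 → 66) is just the extra digit in the version string. A minimal sketch for re-checking an entry against an unpacked wheel (the path argument is illustrative):

```python
import base64
import hashlib
from pathlib import Path

def record_digest(path: Path) -> str:
    """Wheel RECORD digest: 'sha256=' + urlsafe base64 of the raw hash, no '=' padding."""
    raw = hashlib.sha256(path.read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii")

# Against an unpacked 2.0.10 wheel, this should reproduce the RECORD line above:
# record_digest(Path("refchecker/__version__.py"))
# -> 'sha256=-Z5Qa0W7m3Azi2xuo3NQNPvyofIq7M771Vvd2YjQ1-4'
```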
backend/main.py CHANGED
@@ -13,6 +13,7 @@ from fastapi.responses import FileResponse, HTMLResponse
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
 import logging
+from refchecker.__version__ import __version__
 
 import aiosqlite
 from .database import db
@@ -136,6 +137,12 @@ async def health():
     return {"status": "healthy"}
 
 
+@app.get("/api/version")
+async def version():
+    """Return server/CLI version from refchecker package."""
+    return {"version": __version__}
+
+
 @app.websocket("/api/ws/{session_id}")
 async def websocket_endpoint(websocket: WebSocket, session_id: str):
     """WebSocket endpoint for real-time updates"""
refchecker/__version__.py CHANGED
@@ -1,3 +1,3 @@
 """Version information for RefChecker."""
 
-__version__ = "2.0.8"
+__version__ = "2.0.10"
refchecker/utils/text_utils.py CHANGED
@@ -3095,6 +3095,35 @@ def validate_parsed_references(references):
     }
 
 
+def is_access_note(text):
+    """
+    Check if text is an access note like '[Online; accessed DD-MM-YYYY]' or '[Accessed: YYYY-MM-DD]'.
+    These should not be treated as titles or venues.
+
+    Args:
+        text: Text to check
+
+    Returns:
+        True if text appears to be an access/retrieval note
+    """
+    if not text:
+        return False
+    text_clean = text.strip().rstrip('.')
+    # Common patterns for access notes
+    access_patterns = [
+        r'^\[Online;?\s*accessed\s+[\d\-/]+\]$',  # [Online; accessed 07-12-2024]
+        r'^\[Accessed:?\s+[\d\-/]+\]$',           # [Accessed: 2024-07-12]
+        r'^\[Online\]$',                          # [Online]
+        r'^\[accessed\s+[\d\-/]+\]$',             # [accessed 07-12-2024]
+        r'^\[Online,?\s+accessed\s+[\d\-/]+\]$',  # [Online, accessed 07-12-2024]
+        r'^Online;\s*accessed\s+[\d\-/]+$',       # Online; accessed 07-12-2024 (without brackets)
+    ]
+    for pattern in access_patterns:
+        if re.match(pattern, text_clean, re.IGNORECASE):
+            return True
+    return False
+
+
 def extract_latex_references(text, file_path=None):  # pylint: disable=unused-argument
     """
     Extract references from LaTeX content programmatically
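
To make the helper's intent concrete, here are inputs it should accept and reject, derived directly from the patterns above (note the case-insensitive match and the stripped trailing period):

```python
# Access notes: matched case-insensitively, optional trailing period.
for note in [
    "[Online; accessed 07-12-2024]",
    "[Accessed: 2024-07-12]",
    "[Online]",
    "[accessed 07-12-2024]",
    "Online; accessed 07-12-2024",
    "[online; accessed 07/12/2024].",
]:
    assert is_access_note(note)

# Real titles and venues fall through to normal parsing.
for text in ["The caida anonymized internet traces",
             "Proceedings of the ACM SIGCOMM Conference"]:
    assert not is_access_note(text)
```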
@@ -3220,191 +3249,244 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-argument
         # Clean and extract authors
         author_part_clean = strip_latex_commands(author_part).strip()
 
-        # Simple fix: just improve the organization detection without complex parsing
-        # Remove year pattern first - handle both parenthetical and standalone years
-        author_text_clean = re.sub(r'\s*\(\d{4}\)\.?$', '', author_part_clean).strip()
-        author_text_clean = re.sub(r'\s+\d{4}\.?$', '', author_text_clean).strip()
+        # Special case: Check if second part is just an access note like [Online; accessed ...]
+        # This indicates the reference has no authors, and the first part is actually the title
+        # e.g., "The caida anonymized internet traces.\n\newblock [Online; accessed 07-12-2024]."
+        first_part_is_title = False
+        if len(parts) >= 2:
+            second_part_clean = strip_latex_commands(parts[1]).strip()
+            if is_access_note(second_part_clean):
+                first_part_is_title = True
+                # Use first part as title, not authors
+                title_text_from_first = author_part_clean.rstrip('.')
+                if title_text_from_first and len(title_text_from_first) > 5:
+                    ref['title'] = title_text_from_first
+                # Don't set authors - this reference has none (or just a dataset name)
+
+        if first_part_is_title:
+            # Skip normal author/title parsing - already handled above
+            pass
+        else:
+            # Normal case: first part contains authors
+            # Simple fix: just improve the organization detection without complex parsing
+            # Remove year pattern first - handle both parenthetical and standalone years
+            author_text_clean = re.sub(r'\s*\(\d{4}\)\.?$', '', author_part_clean).strip()
+            author_text_clean = re.sub(r'\s+\d{4}\.?$', '', author_text_clean).strip()
 
-        # Better organization detection - check if it looks like multiple authors
-        is_multi_author = (
-            ', and ' in author_text_clean or  # "A, B, and C" format
-            ' and ' in author_text_clean or  # "A and B" format
-            re.search(r'\w+,\s+[A-Z]\.', author_text_clean) or  # "Last, F." patterns
-            (author_text_clean.count(',') >= 2 and len(author_text_clean) > 30)  # Multiple commas in longer text
-        )
+            # Better organization detection - check if it looks like multiple authors
+            is_multi_author = (
+                ', and ' in author_text_clean or  # "A, B, and C" format
+                ' and ' in author_text_clean or  # "A and B" format
+                re.search(r'\w+,\s+[A-Z]\.', author_text_clean) or  # "Last, F." patterns
+                (author_text_clean.count(',') >= 2 and len(author_text_clean) > 30)  # Multiple commas in longer text
+            )
 
-        if is_multi_author:
-            # Parse multiple authors - use existing logic from parse_authors_with_initials
-            try:
-                parsed_authors = parse_authors_with_initials(author_text_clean)
-                if parsed_authors and len(parsed_authors) > 1:
-                    # Clean up "and" prefixes, periods, and preserve "et al"
-                    cleaned_authors = []
-                    for author in parsed_authors:
-                        # Remove leading "and"
-                        author = re.sub(r'^and\s+', '', author.strip())
-                        # Remove trailing periods that shouldn't be there
-                        author = clean_author_name(author)
-                        # Preserve "et al" variants to enable proper author count handling
-                        if author.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'al., et', 'others', 'and others']:
-                            cleaned_authors.append('et al')  # Normalize to standard form
-                        else:
-                            cleaned_authors.append(author)
-                    if cleaned_authors:
-                        ref['authors'] = cleaned_authors
-                else:
-                    # Fallback: try once more with semicolon handling, then simple comma split
+            if is_multi_author:
+                # Parse multiple authors - use existing logic from parse_authors_with_initials
+                try:
+                    parsed_authors = parse_authors_with_initials(author_text_clean)
+                    if parsed_authors and len(parsed_authors) > 1:
+                        # Clean up "and" prefixes, periods, and preserve "et al"
+                        cleaned_authors = []
+                        for author in parsed_authors:
+                            # Remove leading "and"
+                            author = re.sub(r'^and\s+', '', author.strip())
+                            # Remove trailing periods that shouldn't be there
+                            author = clean_author_name(author)
+                            # Preserve "et al" variants to enable proper author count handling
+                            if author.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'al., et', 'others', 'and others']:
+                                cleaned_authors.append('et al')  # Normalize to standard form
+                            else:
+                                cleaned_authors.append(author)
+                        if cleaned_authors:
+                            ref['authors'] = cleaned_authors
+                    else:
+                        # Fallback: try once more with semicolon handling, then simple comma split
+                        simple_authors = []
+                        try:
+                            # Try parsing again with normalized separators
+                            normalized_text = re.sub(r';\s*and\s+', ', ', author_text_clean)
+                            fallback_authors = parse_authors_with_initials(normalized_text)
+                            if fallback_authors and len(fallback_authors) >= 2:
+                                simple_authors = fallback_authors
+                            else:
+                                raise ValueError("Fallback parsing failed")
+                        except:
+                            # Last resort: naive comma split
+                            for a in author_text_clean.split(','):
+                                a = a.strip()
+                                # Remove "and" prefix and skip short/empty entries
+                                a = re.sub(r'^and\s+', '', a)
+                                # Clean author name (remove unnecessary periods)
+                                a = clean_author_name(a)
+                                if a and len(a) > 2:
+                                    # Preserve "et al" variants to enable proper author count handling
+                                    if a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+                                        simple_authors.append('et al')  # Normalize to standard form
+                                    else:
+                                        simple_authors.append(a)
+                                elif a and a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+                                    simple_authors.append('et al')  # Handle short "et al" variants
+
+                        if simple_authors:
+                            ref['authors'] = simple_authors
+                except Exception:
+                    # Fallback: simple comma split with cleanup
                     simple_authors = []
-                    try:
-                        # Try parsing again with normalized separators
-                        normalized_text = re.sub(r';\s*and\s+', ', ', author_text_clean)
-                        fallback_authors = parse_authors_with_initials(normalized_text)
-                        if fallback_authors and len(fallback_authors) >= 2:
-                            simple_authors = fallback_authors
-                        else:
-                            raise ValueError("Fallback parsing failed")
-                    except:
-                        # Last resort: naive comma split
-                        for a in author_text_clean.split(','):
-                            a = a.strip()
-                            # Remove "and" prefix and skip short/empty entries
-                            a = re.sub(r'^and\s+', '', a)
-                            # Clean author name (remove unnecessary periods)
-                            a = clean_author_name(a)
-                            if a and len(a) > 2:
-                                # Preserve "et al" variants to enable proper author count handling
-                                if a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
-                                    simple_authors.append('et al')  # Normalize to standard form
-                                else:
-                                    simple_authors.append(a)
-                            elif a and a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
-                                simple_authors.append('et al')  # Handle short "et al" variants
-
+                    for a in author_text_clean.split(','):
+                        a = a.strip()
+                        # Remove "and" prefix and skip short/empty entries
+                        a = re.sub(r'^and\s+', '', a)
+                        # Clean author name (remove unnecessary periods)
+                        a = clean_author_name(a)
+                        if a and len(a) > 2:
+                            # Preserve "et al" variants to enable proper author count handling
+                            if a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+                                simple_authors.append('et al')  # Normalize to standard form
+                            else:
+                                simple_authors.append(a)
+                        elif a and a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+                            simple_authors.append('et al')  # Handle short "et al" variants
                     if simple_authors:
                         ref['authors'] = simple_authors
-            except Exception:
-                # Fallback: simple comma split with cleanup
-                simple_authors = []
-                for a in author_text_clean.split(','):
-                    a = a.strip()
-                    # Remove "and" prefix and skip short/empty entries
-                    a = re.sub(r'^and\s+', '', a)
-                    # Clean author name (remove unnecessary periods)
-                    a = clean_author_name(a)
-                    if a and len(a) > 2:
-                        # Preserve "et al" variants to enable proper author count handling
-                        if a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
-                            simple_authors.append('et al')  # Normalize to standard form
-                        else:
-                            simple_authors.append(a)
-                    elif a and a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
-                        simple_authors.append('et al')  # Handle short "et al" variants
-                if simple_authors:
-                    ref['authors'] = simple_authors
-        else:
-            # Single organization author
-            author_name = clean_author_name(author_text_clean)
-            if author_name and len(author_name) > 2:
-                ref['authors'] = [author_name]
+            else:
+                # Single organization author
+                author_name = clean_author_name(author_text_clean)
+                if author_name and len(author_name) > 2:
+                    ref['authors'] = [author_name]
 
         # Second part is usually title
-        if len(parts) >= 2:
+        if len(parts) >= 2 and not first_part_is_title:
             title_part = parts[1].strip()
 
-            # Handle \href{URL}{text} or \href {URL} {text} format
-            # Extract URL before stripping LaTeX commands
-            # We need to use balanced brace matching because titles can contain
-            # nested braces like {LLM} for capitalization protection
-            href_url = None
-            title_text = None
+            # Check if this is an access note - skip if so
+            title_part_clean = strip_latex_commands(title_part).strip()
+            if is_access_note(title_part_clean):
+                # This is just an access note, not a title
+                pass
+            else:
+                # Check if this is a URL-only part (common for @misc website references)
+                # Pattern: \url{...}, YEAR or just \url{...}
+                # In this case, use the author/organization name as the title instead
+                url_only_match = re.match(r'^\\url\{[^}]+\}(?:\s*,\s*\d{4})?\.?\s*$', title_part)
+                if url_only_match:
+                    # This is a URL-only block, not a title
+                    # For website/misc references, the org name IS the title
+                    # Use the author_part_clean as title if it looks like an org name
+                    if author_part_clean and not ref.get('title'):
+                        # Organization names are often in braces, clean them up
+                        org_title = author_part_clean.strip('{}.')
+                        if org_title and len(org_title) > 2:
+                            ref['title'] = org_title
+                    # Continue to extract URL below
 
-            href_start = title_part.find('\\href')
-            if href_start != -1:
-                # Find first opening brace (URL)
-                pos = href_start + 5  # Skip \href
-                while pos < len(title_part) and title_part[pos] in ' \t\n':
-                    pos += 1
+                # Handle \href{URL}{text} or \href {URL} {text} format
+                # Extract URL before stripping LaTeX commands
+                # We need to use balanced brace matching because titles can contain
+                # nested braces like {LLM} for capitalization protection
+                href_url = None
+                title_text = None
 
-                if pos < len(title_part) and title_part[pos] == '{':
-                    # Extract URL using balanced braces
-                    brace_count = 0
-                    url_start = pos + 1
-                    url_end = pos
-                    for i in range(pos, len(title_part)):
-                        if title_part[i] == '{':
-                            brace_count += 1
-                        elif title_part[i] == '}':
-                            brace_count -= 1
-                            if brace_count == 0:
-                                url_end = i
-                                break
+                href_start = title_part.find('\\href')
+                if href_start != -1:
+                    # Find first opening brace (URL)
+                    pos = href_start + 5  # Skip \href
+                    while pos < len(title_part) and title_part[pos] in ' \t\n':
+                        pos += 1
 
-                if url_end > url_start:
-                    href_url = title_part[url_start:url_end].strip()
-
-                    # Now find the second brace group (title text)
-                    pos = url_end + 1
-                    while pos < len(title_part) and title_part[pos] in ' \t\n':
-                        pos += 1
+                    if pos < len(title_part) and title_part[pos] == '{':
+                        # Extract URL using balanced braces
+                        brace_count = 0
+                        url_start = pos + 1
+                        url_end = pos
+                        for i in range(pos, len(title_part)):
+                            if title_part[i] == '{':
+                                brace_count += 1
+                            elif title_part[i] == '}':
+                                brace_count -= 1
+                                if brace_count == 0:
+                                    url_end = i
+                                    break
 
-                    if pos < len(title_part) and title_part[pos] == '{':
-                        # Extract title text using balanced braces
-                        brace_count = 0
-                        text_start = pos + 1
-                        text_end = pos
-                        for i in range(pos, len(title_part)):
-                            if title_part[i] == '{':
-                                brace_count += 1
-                            elif title_part[i] == '}':
-                                brace_count -= 1
-                                if brace_count == 0:
-                                    text_end = i
-                                    break
+                        if url_end > url_start:
+                            href_url = title_part[url_start:url_end].strip()
 
-                    if text_end > text_start:
-                        title_text = title_part[text_start:text_end].strip()
-
-            if href_url and title_text:
+                            # Now find the second brace group (title text)
+                            pos = url_end + 1
+                            while pos < len(title_part) and title_part[pos] in ' \t\n':
+                                pos += 1
+
+                            if pos < len(title_part) and title_part[pos] == '{':
+                                # Extract title text using balanced braces
+                                brace_count = 0
+                                text_start = pos + 1
+                                text_end = pos
+                                for i in range(pos, len(title_part)):
+                                    if title_part[i] == '{':
+                                        brace_count += 1
+                                    elif title_part[i] == '}':
+                                        brace_count -= 1
+                                        if brace_count == 0:
+                                            text_end = i
+                                            break
+
+                                if text_end > text_start:
+                                    title_text = title_part[text_start:text_end].strip()
 
-                # Extract DOI if it's a doi.org URL
-                if 'doi.org/' in href_url and not ref.get('doi'):
-                    doi_match = re.search(r'doi\.org/(.+)$', href_url)
-                    if doi_match:
-                        ref['doi'] = doi_match.group(1)
+                if href_url and title_text:
+
+                    # Extract DOI if it's a doi.org URL
+                    if 'doi.org/' in href_url and not ref.get('doi'):
+                        doi_match = re.search(r'doi\.org/(.+)$', href_url)
+                        if doi_match:
+                            ref['doi'] = doi_match.group(1)
+                        ref['url'] = href_url
+                    # Extract arXiv ID if it's an arxiv URL
+                    elif 'arxiv.org/' in href_url.lower() and not ref.get('url'):
+                        ref['url'] = href_url
+                    # Generic URL
+                    elif not ref.get('url'):
                         ref['url'] = href_url
-                # Extract arXiv ID if it's an arxiv URL
-                elif 'arxiv.org/' in href_url.lower() and not ref.get('url'):
-                    ref['url'] = href_url
-                # Generic URL
-                elif not ref.get('url'):
-                    ref['url'] = href_url
+
+                    # Use the title text (second part of href), not the URL
+                    title_clean = strip_latex_commands(title_text).strip()
+                elif not url_only_match:
+                    # Only extract title from this part if it's not a URL-only block
+                    title_clean = strip_latex_commands(title_part).strip()
+                else:
+                    # URL-only block - title already set from org name above
+                    title_clean = None
 
-                # Use the title text (second part of href), not the URL
-                title_clean = strip_latex_commands(title_text).strip()
-            else:
-                title_clean = strip_latex_commands(title_part).strip()
-
-            # Remove trailing dots and clean up
-            title_clean = title_clean.rstrip('.')
-            if title_clean and len(title_clean) > 5:  # Reasonable title length
-                ref['title'] = title_clean
+                # Remove trailing dots and clean up
+                if title_clean:
+                    title_clean = title_clean.rstrip('.')
+                    # Also remove leading comma and year pattern that may remain from URL stripping
+                    title_clean = re.sub(r'^,\s*\d{4}\s*$', '', title_clean).strip()
+                    title_clean = re.sub(r'^,\s*', '', title_clean).strip()
+                    if title_clean and len(title_clean) > 5:  # Reasonable title length
+                        ref['title'] = title_clean
 
         # Third part is usually venue/journal
         if len(parts) >= 3:
             venue_part = parts[2].strip()
             venue_clean = strip_latex_commands(venue_part).strip()
-            # Remove "In " prefix if present (common in bbl format)
-            venue_clean = re.sub(r'^In\s+', '', venue_clean)
-            # Remove trailing year only (at end of string), not year in the middle of venue name
-            # e.g., "2020 Conference on..." should keep the conference name
-            if ref['year']:
-                # Only remove year if it appears at the very end (possibly with punctuation)
-                venue_clean = re.sub(rf',?\s*{ref["year"]}\s*\.?\s*$', '', venue_clean)
-            venue_clean = venue_clean.rstrip(',. ')
-            # Filter out common non-venue patterns that shouldn't be treated as venues
-            non_venue_patterns = ['URL', 'url', 'http:', 'https:', 'DOI', 'doi:', 'ArXiv', 'arxiv:']
-            if venue_clean and not any(pattern in venue_clean for pattern in non_venue_patterns):
-                ref['journal'] = venue_clean
+
+            # Check if this is an access note - skip if so
+            if is_access_note(venue_clean):
+                pass  # Don't treat access notes as venues
+            else:
+                # Remove "In " prefix if present (common in bbl format)
+                venue_clean = re.sub(r'^In\s+', '', venue_clean)
+                # Remove trailing year only (at end of string), not year in the middle of venue name
+                # e.g., "2020 Conference on..." should keep the conference name
+                if ref['year']:
+                    # Only remove year if it appears at the very end (possibly with punctuation)
+                    venue_clean = re.sub(rf',?\s*{ref["year"]}\s*\.?\s*$', '', venue_clean)
+                venue_clean = venue_clean.rstrip(',. ')
+                # Filter out common non-venue patterns that shouldn't be treated as venues
+                non_venue_patterns = ['URL', 'url', 'http:', 'https:', 'DOI', 'doi:', 'ArXiv', 'arxiv:']
+                if venue_clean and not any(pattern in venue_clean for pattern in non_venue_patterns):
+                    ref['journal'] = venue_clean
 
         # Extract URL if present
         url_match = re.search(r'\\url\{([^}]+)\}', content)
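
Taken together, the restructuring means a \bibitem whose second \newblock is only an access note now yields a title and no phantom authors. A sketch of the expected behavior (assuming academic-refchecker 2.0.10 is installed; the exact return shape is the list of reference dicts built above):

```python
from refchecker.utils.text_utils import extract_latex_references

# The example cited in the diff's own comments: a reference with no authors,
# whose second \newblock is only an access note.
bbl = r"""\bibitem{caida}
The caida anonymized internet traces.
\newblock [Online; accessed 07-12-2024].
"""

refs = extract_latex_references(bbl)
# Expected with 2.0.10: the first part becomes the title and no authors are set,
# whereas 2.0.8 treated "The caida anonymized internet traces" as an author name.
print(refs[0].get('title'))    # 'The caida anonymized internet traces'
print(refs[0].get('authors'))  # None (no authors recorded)
```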