academic-refchecker 2.0.9__py3-none-any.whl → 2.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-2.0.9.dist-info → academic_refchecker-2.0.11.dist-info}/METADATA +1 -1
- {academic_refchecker-2.0.9.dist-info → academic_refchecker-2.0.11.dist-info}/RECORD +8 -8
- refchecker/__version__.py +1 -1
- refchecker/utils/text_utils.py +245 -163
- {academic_refchecker-2.0.9.dist-info → academic_refchecker-2.0.11.dist-info}/WHEEL +0 -0
- {academic_refchecker-2.0.9.dist-info → academic_refchecker-2.0.11.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-2.0.9.dist-info → academic_refchecker-2.0.11.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-2.0.9.dist-info → academic_refchecker-2.0.11.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
academic_refchecker-2.0.
|
|
1
|
+
academic_refchecker-2.0.11.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
|
|
2
2
|
backend/__init__.py,sha256=TFVkOx5tSp3abty15RzUbaSwQ9ZD0kfUn7PDh63xkYY,521
|
|
3
3
|
backend/__main__.py,sha256=74V7yUMsRSZaaRyXYm-rZVc3TVUcUgwsoTQTUbV5EqM,211
|
|
4
4
|
backend/cli.py,sha256=xV3l9M5OdNQQYOcrzj2d_7RmCgj7CXP_1oi0TPe6zNo,1672
|
|
@@ -16,7 +16,7 @@ backend/static/assets/index-2P6L_39v.css,sha256=KC3Wa6jfD1qwmEoVpqTovlzf8fsn5oHY
|
|
|
16
16
|
backend/static/assets/index-hk21nqxR.js,sha256=z2agP8ZFYw4AfYi-GJ5E_8_k-lPF-frXOJtPk-I0hDs,369533
|
|
17
17
|
refchecker/__init__.py,sha256=Pg5MrtLxDBRcNYcI02N-bv3tzURVd1S3nQ8IyF7Zw7E,322
|
|
18
18
|
refchecker/__main__.py,sha256=agBbT9iKN0g2xXtRNCoh29Nr7z2n5vU-r0MCVJKi4tI,232
|
|
19
|
-
refchecker/__version__.py,sha256=
|
|
19
|
+
refchecker/__version__.py,sha256=xQXcCOSnpBnaLZygtDKbuiGK368plb0wUEcXNuWi7_s,66
|
|
20
20
|
refchecker/checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
|
|
21
21
|
refchecker/checkers/crossref.py,sha256=88moAyTudBqf9SKqTQkNAq1yyuRe95f8r4EpmJznupQ,20937
|
|
22
22
|
refchecker/checkers/enhanced_hybrid_checker.py,sha256=2jIeUX7hankPok3M4de9o2bsJZ17ZomuLkdfdr9EV0s,28671
|
|
@@ -54,11 +54,11 @@ refchecker/utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,
|
|
|
54
54
|
refchecker/utils/doi_utils.py,sha256=_7YvQ0DTOQBMIujUE0SdJicjPiAR3VETLU668GIji24,6094
|
|
55
55
|
refchecker/utils/error_utils.py,sha256=8TcfRUD6phZ7viPJrezQ4jKf_vE65lqEXZq5707eU6s,15425
|
|
56
56
|
refchecker/utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
|
|
57
|
-
refchecker/utils/text_utils.py,sha256=
|
|
57
|
+
refchecker/utils/text_utils.py,sha256=ZIdvP75F_4o_p2lB24CkuX_eEjB9x-BY2FlXsOiYjkQ,234082
|
|
58
58
|
refchecker/utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
|
|
59
59
|
refchecker/utils/url_utils.py,sha256=7b0rWCQJSajzqOvD7ghsBZPejiq6mUIz6SGhvU_WGDs,9441
|
|
60
|
-
academic_refchecker-2.0.
|
|
61
|
-
academic_refchecker-2.0.
|
|
62
|
-
academic_refchecker-2.0.
|
|
63
|
-
academic_refchecker-2.0.
|
|
64
|
-
academic_refchecker-2.0.
|
|
60
|
+
academic_refchecker-2.0.11.dist-info/METADATA,sha256=oQhQAzud3SET3ya5MUc_z7FCN3FeguPeUYNew2jXSXc,26576
|
|
61
|
+
academic_refchecker-2.0.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
62
|
+
academic_refchecker-2.0.11.dist-info/entry_points.txt,sha256=9cREsaKwlp05Ql0CBIjKrNHk5IG2cHY5LvJPsV2-SxA,108
|
|
63
|
+
academic_refchecker-2.0.11.dist-info/top_level.txt,sha256=FfNvrvpj25gfpUBjW0epvz7Qrdejhups5Za_DBiSRu4,19
|
|
64
|
+
academic_refchecker-2.0.11.dist-info/RECORD,,
|
refchecker/__version__.py
CHANGED
refchecker/utils/text_utils.py
CHANGED
|
@@ -3095,6 +3095,35 @@ def validate_parsed_references(references):
|
|
|
3095
3095
|
}
|
|
3096
3096
|
|
|
3097
3097
|
|
|
3098
|
+
# Month names (full or abbreviated) that may appear inside an access date,
# e.g. "22-July-2004" or "Jul. 12, 2024".
_ACCESS_NOTE_MONTH = (
    r'(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|'
    r'aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)'
)

# A date starts with a digit, '-', '/' or a month name and may then continue
# with more of the same plus spaces, dots and commas.  Everything the previous
# numeric-only pattern ([\d\-/]+) accepted is still accepted.
_ACCESS_NOTE_DATE = (
    rf'(?:[\d\-/]|{_ACCESS_NOTE_MONTH})(?:[\d\-/.,\s]|{_ACCESS_NOTE_MONTH})*'
)

# Compiled once at import time instead of rebuilding the list on every call.
_ACCESS_NOTE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # [Online; accessed 07-12-2024] / [Online, accessed 22-July-2004]
        rf'^\[Online[;,]?\s*accessed\s+{_ACCESS_NOTE_DATE}\]$',
        # [Accessed: 2024-07-12] / [accessed 07-12-2024]
        rf'^\[Accessed:?\s+{_ACCESS_NOTE_DATE}\]$',
        # [Online]
        r'^\[Online\]$',
        # Online; accessed 07-12-2024 (without brackets)
        rf'^Online;\s*accessed\s+{_ACCESS_NOTE_DATE}$',
    )
)


def is_access_note(text):
    """
    Check if text is an access note like '[Online; accessed DD-MM-YYYY]',
    '[Online; accessed 22-July-2004]' or '[Accessed: YYYY-MM-DD]'.
    These should not be treated as titles or venues.

    Matching is case-insensitive and ignores surrounding whitespace and
    trailing periods. Dates may be purely numeric (separated by '-', '/',
    '.') or contain English month names, full or abbreviated.

    Args:
        text: Text to check

    Returns:
        True if text appears to be an access/retrieval note
    """
    if not text:
        return False
    # Drop the sentence-ending period(s) and any whitespace that preceded
    # them, so "[Online] ." is handled the same way as "[Online]."
    text_clean = text.strip().rstrip('.').rstrip()
    return any(pattern.match(text_clean) for pattern in _ACCESS_NOTE_PATTERNS)
|
|
3125
|
+
|
|
3126
|
+
|
|
3098
3127
|
def extract_latex_references(text, file_path=None): # pylint: disable=unused-argument
|
|
3099
3128
|
"""
|
|
3100
3129
|
Extract references from LaTeX content programmatically
|
|
@@ -3220,191 +3249,244 @@ def extract_latex_references(text, file_path=None): # pylint: disable=unused-ar
|
|
|
3220
3249
|
# Clean and extract authors
|
|
3221
3250
|
author_part_clean = strip_latex_commands(author_part).strip()
|
|
3222
3251
|
|
|
3223
|
-
#
|
|
3224
|
-
#
|
|
3225
|
-
|
|
3226
|
-
|
|
3252
|
+
# Special case: Check if second part is just an access note like [Online; accessed ...]
|
|
3253
|
+
# This indicates the reference has no authors, and the first part is actually the title
|
|
3254
|
+
# e.g., "The caida anonymized internet traces.\n\newblock [Online; accessed 07-12-2024]."
|
|
3255
|
+
first_part_is_title = False
|
|
3256
|
+
if len(parts) >= 2:
|
|
3257
|
+
second_part_clean = strip_latex_commands(parts[1]).strip()
|
|
3258
|
+
if is_access_note(second_part_clean):
|
|
3259
|
+
first_part_is_title = True
|
|
3260
|
+
# Use first part as title, not authors
|
|
3261
|
+
title_text_from_first = author_part_clean.rstrip('.')
|
|
3262
|
+
if title_text_from_first and len(title_text_from_first) > 5:
|
|
3263
|
+
ref['title'] = title_text_from_first
|
|
3264
|
+
# Don't set authors - this reference has none (or just a dataset name)
|
|
3265
|
+
|
|
3266
|
+
if first_part_is_title:
|
|
3267
|
+
# Skip normal author/title parsing - already handled above
|
|
3268
|
+
pass
|
|
3269
|
+
else:
|
|
3270
|
+
# Normal case: first part contains authors
|
|
3271
|
+
# Simple fix: just improve the organization detection without complex parsing
|
|
3272
|
+
# Remove year pattern first - handle both parenthetical and standalone years
|
|
3273
|
+
author_text_clean = re.sub(r'\s*\(\d{4}\)\.?$', '', author_part_clean).strip()
|
|
3274
|
+
author_text_clean = re.sub(r'\s+\d{4}\.?$', '', author_text_clean).strip()
|
|
3227
3275
|
|
|
3228
|
-
|
|
3229
|
-
|
|
3230
|
-
|
|
3231
|
-
|
|
3232
|
-
|
|
3233
|
-
|
|
3234
|
-
|
|
3276
|
+
# Better organization detection - check if it looks like multiple authors
|
|
3277
|
+
is_multi_author = (
|
|
3278
|
+
', and ' in author_text_clean or # "A, B, and C" format
|
|
3279
|
+
' and ' in author_text_clean or # "A and B" format
|
|
3280
|
+
re.search(r'\w+,\s+[A-Z]\.', author_text_clean) or # "Last, F." patterns
|
|
3281
|
+
(author_text_clean.count(',') >= 2 and len(author_text_clean) > 30) # Multiple commas in longer text
|
|
3282
|
+
)
|
|
3235
3283
|
|
|
3236
|
-
|
|
3237
|
-
|
|
3238
|
-
|
|
3239
|
-
|
|
3240
|
-
|
|
3241
|
-
|
|
3242
|
-
|
|
3243
|
-
|
|
3244
|
-
|
|
3245
|
-
|
|
3246
|
-
|
|
3247
|
-
|
|
3248
|
-
|
|
3249
|
-
|
|
3250
|
-
|
|
3251
|
-
|
|
3252
|
-
|
|
3253
|
-
|
|
3254
|
-
|
|
3255
|
-
|
|
3256
|
-
|
|
3284
|
+
if is_multi_author:
|
|
3285
|
+
# Parse multiple authors - use existing logic from parse_authors_with_initials
|
|
3286
|
+
try:
|
|
3287
|
+
parsed_authors = parse_authors_with_initials(author_text_clean)
|
|
3288
|
+
if parsed_authors and len(parsed_authors) > 1:
|
|
3289
|
+
# Clean up "and" prefixes, periods, and preserve "et al"
|
|
3290
|
+
cleaned_authors = []
|
|
3291
|
+
for author in parsed_authors:
|
|
3292
|
+
# Remove leading "and"
|
|
3293
|
+
author = re.sub(r'^and\s+', '', author.strip())
|
|
3294
|
+
# Remove trailing periods that shouldn't be there
|
|
3295
|
+
author = clean_author_name(author)
|
|
3296
|
+
# Preserve "et al" variants to enable proper author count handling
|
|
3297
|
+
if author.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'al., et', 'others', 'and others']:
|
|
3298
|
+
cleaned_authors.append('et al') # Normalize to standard form
|
|
3299
|
+
else:
|
|
3300
|
+
cleaned_authors.append(author)
|
|
3301
|
+
if cleaned_authors:
|
|
3302
|
+
ref['authors'] = cleaned_authors
|
|
3303
|
+
else:
|
|
3304
|
+
# Fallback: try once more with semicolon handling, then simple comma split
|
|
3305
|
+
simple_authors = []
|
|
3306
|
+
try:
|
|
3307
|
+
# Try parsing again with normalized separators
|
|
3308
|
+
normalized_text = re.sub(r';\s*and\s+', ', ', author_text_clean)
|
|
3309
|
+
fallback_authors = parse_authors_with_initials(normalized_text)
|
|
3310
|
+
if fallback_authors and len(fallback_authors) >= 2:
|
|
3311
|
+
simple_authors = fallback_authors
|
|
3312
|
+
else:
|
|
3313
|
+
raise ValueError("Fallback parsing failed")
|
|
3314
|
+
except:
|
|
3315
|
+
# Last resort: naive comma split
|
|
3316
|
+
for a in author_text_clean.split(','):
|
|
3317
|
+
a = a.strip()
|
|
3318
|
+
# Remove "and" prefix and skip short/empty entries
|
|
3319
|
+
a = re.sub(r'^and\s+', '', a)
|
|
3320
|
+
# Clean author name (remove unnecessary periods)
|
|
3321
|
+
a = clean_author_name(a)
|
|
3322
|
+
if a and len(a) > 2:
|
|
3323
|
+
# Preserve "et al" variants to enable proper author count handling
|
|
3324
|
+
if a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
|
|
3325
|
+
simple_authors.append('et al') # Normalize to standard form
|
|
3326
|
+
else:
|
|
3327
|
+
simple_authors.append(a)
|
|
3328
|
+
elif a and a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
|
|
3329
|
+
simple_authors.append('et al') # Handle short "et al" variants
|
|
3330
|
+
|
|
3331
|
+
if simple_authors:
|
|
3332
|
+
ref['authors'] = simple_authors
|
|
3333
|
+
except Exception:
|
|
3334
|
+
# Fallback: simple comma split with cleanup
|
|
3257
3335
|
simple_authors = []
|
|
3258
|
-
|
|
3259
|
-
|
|
3260
|
-
|
|
3261
|
-
|
|
3262
|
-
|
|
3263
|
-
|
|
3264
|
-
|
|
3265
|
-
|
|
3266
|
-
|
|
3267
|
-
|
|
3268
|
-
|
|
3269
|
-
|
|
3270
|
-
|
|
3271
|
-
|
|
3272
|
-
# Clean author name (remove unnecessary periods)
|
|
3273
|
-
a = clean_author_name(a)
|
|
3274
|
-
if a and len(a) > 2:
|
|
3275
|
-
# Preserve "et al" variants to enable proper author count handling
|
|
3276
|
-
if a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
|
|
3277
|
-
simple_authors.append('et al') # Normalize to standard form
|
|
3278
|
-
else:
|
|
3279
|
-
simple_authors.append(a)
|
|
3280
|
-
elif a and a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
|
|
3281
|
-
simple_authors.append('et al') # Handle short "et al" variants
|
|
3282
|
-
|
|
3336
|
+
for a in author_text_clean.split(','):
|
|
3337
|
+
a = a.strip()
|
|
3338
|
+
# Remove "and" prefix and skip short/empty entries
|
|
3339
|
+
a = re.sub(r'^and\s+', '', a)
|
|
3340
|
+
# Clean author name (remove unnecessary periods)
|
|
3341
|
+
a = clean_author_name(a)
|
|
3342
|
+
if a and len(a) > 2:
|
|
3343
|
+
# Preserve "et al" variants to enable proper author count handling
|
|
3344
|
+
if a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
|
|
3345
|
+
simple_authors.append('et al') # Normalize to standard form
|
|
3346
|
+
else:
|
|
3347
|
+
simple_authors.append(a)
|
|
3348
|
+
elif a and a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
|
|
3349
|
+
simple_authors.append('et al') # Handle short "et al" variants
|
|
3283
3350
|
if simple_authors:
|
|
3284
3351
|
ref['authors'] = simple_authors
|
|
3285
|
-
|
|
3286
|
-
#
|
|
3287
|
-
|
|
3288
|
-
|
|
3289
|
-
|
|
3290
|
-
# Remove "and" prefix and skip short/empty entries
|
|
3291
|
-
a = re.sub(r'^and\s+', '', a)
|
|
3292
|
-
# Clean author name (remove unnecessary periods)
|
|
3293
|
-
a = clean_author_name(a)
|
|
3294
|
-
if a and len(a) > 2:
|
|
3295
|
-
# Preserve "et al" variants to enable proper author count handling
|
|
3296
|
-
if a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
|
|
3297
|
-
simple_authors.append('et al') # Normalize to standard form
|
|
3298
|
-
else:
|
|
3299
|
-
simple_authors.append(a)
|
|
3300
|
-
elif a and a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
|
|
3301
|
-
simple_authors.append('et al') # Handle short "et al" variants
|
|
3302
|
-
if simple_authors:
|
|
3303
|
-
ref['authors'] = simple_authors
|
|
3304
|
-
else:
|
|
3305
|
-
# Single organization author
|
|
3306
|
-
author_name = clean_author_name(author_text_clean)
|
|
3307
|
-
if author_name and len(author_name) > 2:
|
|
3308
|
-
ref['authors'] = [author_name]
|
|
3352
|
+
else:
|
|
3353
|
+
# Single organization author
|
|
3354
|
+
author_name = clean_author_name(author_text_clean)
|
|
3355
|
+
if author_name and len(author_name) > 2:
|
|
3356
|
+
ref['authors'] = [author_name]
|
|
3309
3357
|
|
|
3310
3358
|
# Second part is usually title
|
|
3311
|
-
if len(parts) >= 2:
|
|
3359
|
+
if len(parts) >= 2 and not first_part_is_title:
|
|
3312
3360
|
title_part = parts[1].strip()
|
|
3313
3361
|
|
|
3314
|
-
#
|
|
3315
|
-
|
|
3316
|
-
|
|
3317
|
-
|
|
3318
|
-
|
|
3319
|
-
|
|
3362
|
+
# Check if this is an access note - skip if so
|
|
3363
|
+
title_part_clean = strip_latex_commands(title_part).strip()
|
|
3364
|
+
if is_access_note(title_part_clean):
|
|
3365
|
+
# This is just an access note, not a title
|
|
3366
|
+
pass
|
|
3367
|
+
else:
|
|
3368
|
+
# Check if this is a URL-only part (common for @misc website references)
|
|
3369
|
+
# Pattern: \url{...}, YEAR or just \url{...}
|
|
3370
|
+
# In this case, use the author/organization name as the title instead
|
|
3371
|
+
url_only_match = re.match(r'^\\url\{[^}]+\}(?:\s*,\s*\d{4})?\.?\s*$', title_part)
|
|
3372
|
+
if url_only_match:
|
|
3373
|
+
# This is a URL-only block, not a title
|
|
3374
|
+
# For website/misc references, the org name IS the title
|
|
3375
|
+
# Use the author_part_clean as title if it looks like an org name
|
|
3376
|
+
if author_part_clean and not ref.get('title'):
|
|
3377
|
+
# Organization names are often in braces, clean them up
|
|
3378
|
+
org_title = author_part_clean.strip('{}.')
|
|
3379
|
+
if org_title and len(org_title) > 2:
|
|
3380
|
+
ref['title'] = org_title
|
|
3381
|
+
# Continue to extract URL below
|
|
3320
3382
|
|
|
3321
|
-
|
|
3322
|
-
|
|
3323
|
-
#
|
|
3324
|
-
|
|
3325
|
-
|
|
3326
|
-
|
|
3383
|
+
# Handle \href{URL}{text} or \href {URL} {text} format
|
|
3384
|
+
# Extract URL before stripping LaTeX commands
|
|
3385
|
+
# We need to use balanced brace matching because titles can contain
|
|
3386
|
+
# nested braces like {LLM} for capitalization protection
|
|
3387
|
+
href_url = None
|
|
3388
|
+
title_text = None
|
|
3327
3389
|
|
|
3328
|
-
|
|
3329
|
-
|
|
3330
|
-
|
|
3331
|
-
|
|
3332
|
-
|
|
3333
|
-
|
|
3334
|
-
if title_part[i] == '{':
|
|
3335
|
-
brace_count += 1
|
|
3336
|
-
elif title_part[i] == '}':
|
|
3337
|
-
brace_count -= 1
|
|
3338
|
-
if brace_count == 0:
|
|
3339
|
-
url_end = i
|
|
3340
|
-
break
|
|
3390
|
+
href_start = title_part.find('\\href')
|
|
3391
|
+
if href_start != -1:
|
|
3392
|
+
# Find first opening brace (URL)
|
|
3393
|
+
pos = href_start + 5 # Skip \href
|
|
3394
|
+
while pos < len(title_part) and title_part[pos] in ' \t\n':
|
|
3395
|
+
pos += 1
|
|
3341
3396
|
|
|
3342
|
-
if
|
|
3343
|
-
|
|
3344
|
-
|
|
3345
|
-
|
|
3346
|
-
|
|
3347
|
-
|
|
3348
|
-
|
|
3397
|
+
if pos < len(title_part) and title_part[pos] == '{':
|
|
3398
|
+
# Extract URL using balanced braces
|
|
3399
|
+
brace_count = 0
|
|
3400
|
+
url_start = pos + 1
|
|
3401
|
+
url_end = pos
|
|
3402
|
+
for i in range(pos, len(title_part)):
|
|
3403
|
+
if title_part[i] == '{':
|
|
3404
|
+
brace_count += 1
|
|
3405
|
+
elif title_part[i] == '}':
|
|
3406
|
+
brace_count -= 1
|
|
3407
|
+
if brace_count == 0:
|
|
3408
|
+
url_end = i
|
|
3409
|
+
break
|
|
3349
3410
|
|
|
3350
|
-
if
|
|
3351
|
-
|
|
3352
|
-
brace_count = 0
|
|
3353
|
-
text_start = pos + 1
|
|
3354
|
-
text_end = pos
|
|
3355
|
-
for i in range(pos, len(title_part)):
|
|
3356
|
-
if title_part[i] == '{':
|
|
3357
|
-
brace_count += 1
|
|
3358
|
-
elif title_part[i] == '}':
|
|
3359
|
-
brace_count -= 1
|
|
3360
|
-
if brace_count == 0:
|
|
3361
|
-
text_end = i
|
|
3362
|
-
break
|
|
3411
|
+
if url_end > url_start:
|
|
3412
|
+
href_url = title_part[url_start:url_end].strip()
|
|
3363
3413
|
|
|
3364
|
-
|
|
3365
|
-
|
|
3366
|
-
|
|
3367
|
-
|
|
3414
|
+
# Now find the second brace group (title text)
|
|
3415
|
+
pos = url_end + 1
|
|
3416
|
+
while pos < len(title_part) and title_part[pos] in ' \t\n':
|
|
3417
|
+
pos += 1
|
|
3418
|
+
|
|
3419
|
+
if pos < len(title_part) and title_part[pos] == '{':
|
|
3420
|
+
# Extract title text using balanced braces
|
|
3421
|
+
brace_count = 0
|
|
3422
|
+
text_start = pos + 1
|
|
3423
|
+
text_end = pos
|
|
3424
|
+
for i in range(pos, len(title_part)):
|
|
3425
|
+
if title_part[i] == '{':
|
|
3426
|
+
brace_count += 1
|
|
3427
|
+
elif title_part[i] == '}':
|
|
3428
|
+
brace_count -= 1
|
|
3429
|
+
if brace_count == 0:
|
|
3430
|
+
text_end = i
|
|
3431
|
+
break
|
|
3432
|
+
|
|
3433
|
+
if text_end > text_start:
|
|
3434
|
+
title_text = title_part[text_start:text_end].strip()
|
|
3368
3435
|
|
|
3369
|
-
|
|
3370
|
-
|
|
3371
|
-
|
|
3372
|
-
if
|
|
3373
|
-
|
|
3436
|
+
if href_url and title_text:
|
|
3437
|
+
|
|
3438
|
+
# Extract DOI if it's a doi.org URL
|
|
3439
|
+
if 'doi.org/' in href_url and not ref.get('doi'):
|
|
3440
|
+
doi_match = re.search(r'doi\.org/(.+)$', href_url)
|
|
3441
|
+
if doi_match:
|
|
3442
|
+
ref['doi'] = doi_match.group(1)
|
|
3443
|
+
ref['url'] = href_url
|
|
3444
|
+
# Extract arXiv ID if it's an arxiv URL
|
|
3445
|
+
elif 'arxiv.org/' in href_url.lower() and not ref.get('url'):
|
|
3446
|
+
ref['url'] = href_url
|
|
3447
|
+
# Generic URL
|
|
3448
|
+
elif not ref.get('url'):
|
|
3374
3449
|
ref['url'] = href_url
|
|
3375
|
-
|
|
3376
|
-
|
|
3377
|
-
|
|
3378
|
-
|
|
3379
|
-
|
|
3380
|
-
|
|
3450
|
+
|
|
3451
|
+
# Use the title text (second part of href), not the URL
|
|
3452
|
+
title_clean = strip_latex_commands(title_text).strip()
|
|
3453
|
+
elif not url_only_match:
|
|
3454
|
+
# Only extract title from this part if it's not a URL-only block
|
|
3455
|
+
title_clean = strip_latex_commands(title_part).strip()
|
|
3456
|
+
else:
|
|
3457
|
+
# URL-only block - title already set from org name above
|
|
3458
|
+
title_clean = None
|
|
3381
3459
|
|
|
3382
|
-
#
|
|
3383
|
-
title_clean
|
|
3384
|
-
|
|
3385
|
-
|
|
3386
|
-
|
|
3387
|
-
|
|
3388
|
-
|
|
3389
|
-
|
|
3390
|
-
ref['title'] = title_clean
|
|
3460
|
+
# Remove trailing dots and clean up
|
|
3461
|
+
if title_clean:
|
|
3462
|
+
title_clean = title_clean.rstrip('.')
|
|
3463
|
+
# Also remove leading comma and year pattern that may remain from URL stripping
|
|
3464
|
+
title_clean = re.sub(r'^,\s*\d{4}\s*$', '', title_clean).strip()
|
|
3465
|
+
title_clean = re.sub(r'^,\s*', '', title_clean).strip()
|
|
3466
|
+
if title_clean and len(title_clean) > 5: # Reasonable title length
|
|
3467
|
+
ref['title'] = title_clean
|
|
3391
3468
|
|
|
3392
3469
|
# Third part is usually venue/journal
|
|
3393
3470
|
if len(parts) >= 3:
|
|
3394
3471
|
venue_part = parts[2].strip()
|
|
3395
3472
|
venue_clean = strip_latex_commands(venue_part).strip()
|
|
3396
|
-
|
|
3397
|
-
|
|
3398
|
-
|
|
3399
|
-
|
|
3400
|
-
|
|
3401
|
-
#
|
|
3402
|
-
venue_clean = re.sub(
|
|
3403
|
-
|
|
3404
|
-
|
|
3405
|
-
|
|
3406
|
-
|
|
3407
|
-
|
|
3473
|
+
|
|
3474
|
+
# Check if this is an access note - skip if so
|
|
3475
|
+
if is_access_note(venue_clean):
|
|
3476
|
+
pass # Don't treat access notes as venues
|
|
3477
|
+
else:
|
|
3478
|
+
# Remove "In " prefix if present (common in bbl format)
|
|
3479
|
+
venue_clean = re.sub(r'^In\s+', '', venue_clean)
|
|
3480
|
+
# Remove trailing year only (at end of string), not year in the middle of venue name
|
|
3481
|
+
# e.g., "2020 Conference on..." should keep the conference name
|
|
3482
|
+
if ref['year']:
|
|
3483
|
+
# Only remove year if it appears at the very end (possibly with punctuation)
|
|
3484
|
+
venue_clean = re.sub(rf',?\s*{ref["year"]}\s*\.?\s*$', '', venue_clean)
|
|
3485
|
+
venue_clean = venue_clean.rstrip(',. ')
|
|
3486
|
+
# Filter out common non-venue patterns that shouldn't be treated as venues
|
|
3487
|
+
non_venue_patterns = ['URL', 'url', 'http:', 'https:', 'DOI', 'doi:', 'ArXiv', 'arxiv:']
|
|
3488
|
+
if venue_clean and not any(pattern in venue_clean for pattern in non_venue_patterns):
|
|
3489
|
+
ref['journal'] = venue_clean
|
|
3408
3490
|
|
|
3409
3491
|
# Extract URL if present
|
|
3410
3492
|
url_match = re.search(r'\\url\{([^}]+)\}', content)
|
|
File without changes
|
{academic_refchecker-2.0.9.dist-info → academic_refchecker-2.0.11.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{academic_refchecker-2.0.9.dist-info → academic_refchecker-2.0.11.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|