academic-refchecker 1.2.42__tar.gz → 1.2.44__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {academic_refchecker-1.2.42/src/academic_refchecker.egg-info → academic_refchecker-1.2.44}/PKG-INFO +1 -1
  2. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/__version__.py +1 -1
  3. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
  4. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/core/refchecker.py +1 -20
  5. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/bibtex_parser.py +104 -27
  6. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/text_utils.py +64 -1
  7. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/url_utils.py +44 -1
  8. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/LICENSE +0 -0
  9. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/MANIFEST.in +0 -0
  10. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/README.md +0 -0
  11. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/pyproject.toml +0 -0
  12. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/requirements.txt +0 -0
  13. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/scripts/download_db.py +0 -0
  14. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/scripts/run_tests.py +0 -0
  15. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/scripts/start_vllm_server.py +0 -0
  16. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/setup.cfg +0 -0
  17. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/__init__.py +0 -0
  18. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/academic_refchecker.egg-info/SOURCES.txt +0 -0
  19. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
  20. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
  21. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/academic_refchecker.egg-info/requires.txt +0 -0
  22. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/academic_refchecker.egg-info/top_level.txt +0 -0
  23. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/__init__.py +0 -0
  24. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/crossref.py +0 -0
  25. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/enhanced_hybrid_checker.py +0 -0
  26. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/github_checker.py +0 -0
  27. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/local_semantic_scholar.py +0 -0
  28. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/openalex.py +0 -0
  29. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/openreview_checker.py +0 -0
  30. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/semantic_scholar.py +0 -0
  31. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/checkers/webpage_checker.py +0 -0
  32. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/config/__init__.py +0 -0
  33. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/config/logging.conf +0 -0
  34. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/config/settings.py +0 -0
  35. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/core/__init__.py +0 -0
  36. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/core/db_connection_pool.py +0 -0
  37. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/core/parallel_processor.py +0 -0
  38. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/database/__init__.py +0 -0
  39. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/database/download_semantic_scholar_db.py +0 -0
  40. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/llm/__init__.py +0 -0
  41. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/llm/base.py +0 -0
  42. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/llm/providers.py +0 -0
  43. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/scripts/__init__.py +0 -0
  44. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/scripts/start_vllm_server.py +0 -0
  45. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/services/__init__.py +0 -0
  46. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/services/pdf_processor.py +0 -0
  47. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/__init__.py +0 -0
  48. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/arxiv_utils.py +0 -0
  49. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/author_utils.py +0 -0
  50. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/biblatex_parser.py +0 -0
  51. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/bibliography_utils.py +0 -0
  52. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/config_validator.py +0 -0
  53. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/db_utils.py +0 -0
  54. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/doi_utils.py +0 -0
  55. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/error_utils.py +0 -0
  56. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/mock_objects.py +0 -0
  57. {academic_refchecker-1.2.42 → academic_refchecker-1.2.44}/src/utils/unicode_utils.py +0 -0

--- academic_refchecker-1.2.42/src/academic_refchecker.egg-info/PKG-INFO
+++ academic_refchecker-1.2.44/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 1.2.42
+Version: 1.2.44
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT

--- academic_refchecker-1.2.42/src/__version__.py
+++ academic_refchecker-1.2.44/src/__version__.py
@@ -1,3 +1,3 @@
 """Version information for RefChecker."""
 
-__version__ = "1.2.42"
+__version__ = "1.2.44"

--- academic_refchecker-1.2.42/PKG-INFO
+++ academic_refchecker-1.2.44/src/academic_refchecker.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: academic-refchecker
-Version: 1.2.42
+Version: 1.2.44
 Summary: A comprehensive tool for validating reference accuracy in academic papers
 Author-email: Mark Russinovich <markrussinovich@hotmail.com>
 License-Expression: MIT

--- academic_refchecker-1.2.42/src/core/refchecker.py
+++ academic_refchecker-1.2.44/src/core/refchecker.py
@@ -5181,7 +5181,7 @@ class ArxivReferenceChecker:
         from utils.text_utils import format_authors_for_display
         authors = format_authors_for_display(reference.get('authors', []))
         year = reference.get('year', '')
-        venue = reference.get('venue', '')
+        venue = reference.get('venue', '') or reference.get('journal', '')
         url = reference.get('url', '')
         doi = reference.get('doi', '')
         # Extract actual reference number from raw text for accurate display
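
The changed line above makes the summary display fall back to the BibTeX journal field when no venue was parsed. A minimal sketch of the fallback semantics (the reference dict here is hypothetical, not taken from the package):

    # Hypothetical parsed reference: a journal article with no 'venue' key
    reference = {'journal': 'Nature Physics', 'year': '2021'}
    venue = reference.get('venue', '') or reference.get('journal', '')
    print(venue)  # Nature Physics
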
@@ -5407,25 +5407,6 @@ class ArxivReferenceChecker:
         if error_details:
             subreason = self._categorize_unverified_reason(error_details)
             print(f" Subreason: {subreason}")
-
-        year_str = self._format_year_string(reference.get('year'))
-
-        # Apply LaTeX cleaning and formatting to authors for display
-        authors = reference.get('authors', [])
-        if authors:
-            from utils.text_utils import strip_latex_commands, format_authors_for_display
-            cleaned_authors = [strip_latex_commands(author) for author in authors]
-            authors_display = format_authors_for_display(cleaned_authors)
-        else:
-            authors_display = 'Unknown authors'
-
-        # Only show URL if it exists and is different from reference_url
-        ref_url = reference.get('url', '').strip()
-        if ref_url and ref_url != reference_url:
-            # Clean trailing punctuation from URL display
-            from utils.url_utils import clean_url_punctuation
-            clean_ref_url = clean_url_punctuation(ref_url)
-            print(f" URL: {clean_ref_url}")
 
     def _categorize_unverified_reason(self, error_details):
         """Categorize the unverified error into checker error or not found"""

--- academic_refchecker-1.2.42/src/utils/bibtex_parser.py
+++ academic_refchecker-1.2.44/src/utils/bibtex_parser.py
@@ -103,37 +103,99 @@ def parse_bibtex_entry_content(entry_type: str, entry_key: str, content: str) ->
     Returns:
         Dictionary with parsed entry data
     """
-    # Extract fields using regex
     fields = {}
 
-    # Pattern to match field = {value} or field = "value"
-    # Handle nested braces properly
-    field_pattern = r'(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")'
-
-    for match in re.finditer(field_pattern, content, re.DOTALL):
-        field_name = match.group(1).lower()
-        field_value = match.group(2) or match.group(3) or ""
-        # Strip outer quotes if present (handles cases like title = {"Some Title"})
-        field_value = field_value.strip()
-        if field_value.startswith('"') and field_value.endswith('"'):
-            field_value = field_value[1:-1]
-        fields[field_name] = field_value
+    # Use a more robust approach with manual parsing
+    i = 0
+    while i < len(content):
+        # Skip whitespace
+        while i < len(content) and content[i].isspace():
+            i += 1
+
+        if i >= len(content):
+            break
+
+        # Look for field name
+        field_start = i
+        while i < len(content) and (content[i].isalnum() or content[i] == '_'):
+            i += 1
+
+        if i == field_start:
+            i += 1  # Skip non-alphanumeric character
+            continue
+
+        field_name = content[field_start:i].lower()
+
+        # Skip whitespace
+        while i < len(content) and content[i].isspace():
+            i += 1
+
+        # Look for equals sign
+        if i >= len(content) or content[i] != '=':
+            continue
+        i += 1  # Skip '='
+
+        # Skip whitespace
+        while i < len(content) and content[i].isspace():
+            i += 1
+
+        if i >= len(content):
+            break
+
+        # Parse field value
+        field_value = ""
+        if content[i] == '"':
+            # Handle quoted strings
+            i += 1  # Skip opening quote
+            value_start = i
+            while i < len(content) and content[i] != '"':
+                i += 1
+            if i < len(content):
+                field_value = content[value_start:i]
+                i += 1  # Skip closing quote
+        elif content[i] == '{':
+            # Handle braced strings with proper nesting
+            brace_count = 0
+            value_start = i + 1  # Skip opening brace
+            i += 1
+            while i < len(content):
+                if content[i] == '{':
+                    brace_count += 1
+                elif content[i] == '}':
+                    if brace_count == 0:
+                        break
+                    brace_count -= 1
+                i += 1
+
+            if i < len(content):
+                field_value = content[value_start:i]
+                i += 1  # Skip closing brace
+
+        if field_value:
+            field_value = field_value.strip()
+            # Strip outer quotes if present (handles cases like title = {"Some Title"})
+            if field_value.startswith('"') and field_value.endswith('"'):
+                field_value = field_value[1:-1]
+            fields[field_name] = field_value
+
+        # Skip to next field (look for comma)
+        while i < len(content) and content[i] not in ',}':
+            i += 1
+        if i < len(content) and content[i] == ',':
+            i += 1
 
-    # If field extraction failed, try a simpler approach
+    # Fallback to regex if manual parsing failed
     if not fields:
-        logger.debug("Field extraction failed, trying line-by-line approach")
-        lines = content.split('\n')
-        for line in lines:
-            line = line.strip()
-            if '=' in line:
-                field_match = re.match(r'(\w+)\s*=\s*[{"]([^{}"]*)[}"]', line)
-                if field_match:
-                    field_name = field_match.group(1).lower()
-                    field_value = field_match.group(2).strip()
-                    # Strip outer quotes if present
-                    if field_value.startswith('"') and field_value.endswith('"'):
-                        field_value = field_value[1:-1]
-                    fields[field_name] = field_value
+        logger.debug("Manual parsing failed, trying regex approach")
+        field_pattern = r'(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")'
+
+        for match in re.finditer(field_pattern, content, re.DOTALL):
+            field_name = match.group(1).lower()
+            field_value = match.group(2) or match.group(3) or ""
+            field_value = field_value.strip()
+            if field_value.startswith('"') and field_value.endswith('"'):
+                field_value = field_value[1:-1]
            fields[field_name] = field_value
 
     return {
         'type': entry_type,
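
The rewrite above matters because the old field regex (its braced branch is shown below) only tolerates one level of nested braces, while the new scanner counts brace depth character by character. A self-contained sketch of the brace-counting idea, as a standalone re-implementation for illustration rather than an import from the package:

    import re

    # Braced branch of the replaced pattern: handles {..{..}..} but not {..{..{..}..}..}
    OLD_BRACED = r'(\w+)\s*=\s*\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}'

    def scan_braced_value(content: str, i: int):
        """Return (value, next index) for the '{...}' starting at content[i]."""
        depth, start, i = 0, i + 1, i + 1
        while i < len(content):
            if content[i] == '{':
                depth += 1
            elif content[i] == '}':
                if depth == 0:
                    return content[start:i], i + 1
                depth -= 1
            i += 1
        return content[start:i], i  # unterminated value

    entry = 'title = {A {Deeply {Nested}} Title}'
    print(re.search(OLD_BRACED, entry))                    # None: two nesting levels
    print(scan_braced_value(entry, entry.index('{'))[0])   # A {Deeply {Nested}} Title
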
@@ -216,6 +278,9 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
 
     # Extract journal/venue
     journal = fields.get('journal', fields.get('booktitle', fields.get('venue', '')))
+    # Remove braces from journal/venue names
+    if journal and journal.startswith('{') and journal.endswith('}'):
+        journal = journal[1:-1]
 
     # Extract DOI and construct URL
     doi = fields.get('doi', '')
@@ -225,6 +290,9 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
 
     # Extract other URLs
     url = fields.get('url', '')
+    if url:
+        from utils.url_utils import clean_url
+        url = clean_url(url)
 
     # Handle special @misc entries with only howpublished field
     if not title and not authors and entry_type == 'misc':
@@ -249,6 +317,10 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
         else:
             url = howpublished
 
+        # Clean the reconstructed URL
+        from utils.url_utils import clean_url
+        url = clean_url(url)
+
         # Generate title from domain/path
         if 'jailbreakchat.com' in domain:
             title = 'JailbreakChat Website'
@@ -275,6 +347,11 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
 
     if url.startswith('\\url{') and url.endswith('}'):
         url = url[5:-1]  # Remove \url{...}
+
+    # Clean any URL we extracted
+    if url:
+        from utils.url_utils import clean_url
+        url = clean_url(url)
 
     # Construct ArXiv URL from eprint field if no URL present
     if not url and not doi_url:

--- academic_refchecker-1.2.42/src/utils/text_utils.py
+++ academic_refchecker-1.2.44/src/utils/text_utils.py
@@ -11,6 +11,69 @@ from typing import List
 logger = logging.getLogger(__name__)
 
 
+def expand_abbreviations(text: str) -> str:
+    """
+    Generic abbreviation expansion using common academic patterns.
+
+    This function expands common academic abbreviations to their full forms
+    to improve venue name matching and comparison.
+
+    Args:
+        text: Text containing potential abbreviations
+
+    Returns:
+        Text with abbreviations expanded
+    """
+    if not text:
+        return text
+
+    common_abbrevs = {
+        # IEEE specific abbreviations (only expand with periods, not full words)
+        'robot.': 'robotics', 'autom.': 'automation', 'lett.': 'letters',
+        'trans.': 'transactions', 'syst.': 'systems', 'netw.': 'networks',
+        'learn.': 'learning', 'ind.': 'industrial', 'electron.': 'electronics',
+        'mechatron.': 'mechatronics', 'intell.': 'intelligence',
+        'transp.': 'transportation', 'contr.': 'control', 'mag.': 'magazine',
+        # General academic abbreviations (only expand with periods)
+        'int.': 'international', 'intl.': 'international', 'conf.': 'conference',
+        'j.': 'journal', 'proc.': 'proceedings', 'assoc.': 'association',
+        'comput.': 'computing', 'sci.': 'science', 'eng.': 'engineering',
+        'tech.': 'technology', 'artif.': 'artificial', 'mach.': 'machine',
+        'stat.': 'statistics', 'math.': 'mathematics', 'phys.': 'physics',
+        'chem.': 'chemistry', 'bio.': 'biology', 'med.': 'medicine',
+        'adv.': 'advances', 'ann.': 'annual', 'symp.': 'symposium',
+        'workshop': 'workshop', 'worksh.': 'workshop',
+        'natl.': 'national', 'acad.': 'academy', 'rev.': 'review',
+        # Physics journal abbreviations
+        'phys.': 'physics', 'phys. rev.': 'physical review',
+        'phys. rev. lett.': 'physical review letters',
+        'phys. rev. a': 'physical review a', 'phys. rev. b': 'physical review b',
+        'phys. rev. c': 'physical review c', 'phys. rev. d': 'physical review d',
+        'phys. rev. e': 'physical review e', 'phys. lett.': 'physics letters',
+        'phys. lett. b': 'physics letters b', 'nucl. phys.': 'nuclear physics',
+        'nucl. phys. a': 'nuclear physics a', 'nucl. phys. b': 'nuclear physics b',
+        'j. phys.': 'journal of physics', 'ann. phys.': 'annals of physics',
+        'mod. phys. lett.': 'modern physics letters', 'eur. phys. j.': 'european physical journal',
+        # Nature journals
+        'nature phys.': 'nature physics', 'sci. adv.': 'science advances',
+        # Handle specific multi-word patterns and well-known acronyms
+        'proc. natl. acad. sci.': 'proceedings of the national academy of sciences',
+        'pnas': 'proceedings of the national academy of sciences',
+        'neurips': 'neural information processing systems',
+    }
+
+    # Sort by length (longest first) to ensure longer matches take precedence
+    for abbrev, expansion in sorted(common_abbrevs.items(), key=lambda x: len(x[0]), reverse=True):
+        # For abbreviations ending in period, use word boundary at start only
+        if abbrev.endswith('.'):
+            pattern = r'\b' + re.escape(abbrev)
+        else:
+            pattern = r'\b' + re.escape(abbrev) + r'\b'
+        text = re.sub(pattern, expansion, text)
+
+    return text
+
+
 def normalize_apostrophes(text):
     """
     Normalize all apostrophe variants to standard ASCII apostrophe
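
A hedged usage sketch of the new helper (assumes the package's src layout puts utils.text_utils on the import path; inputs are lowercased here because the table keys are lowercase and the substitution is case-sensitive):

    from utils.text_utils import expand_abbreviations

    print(expand_abbreviations('phys. rev. lett.'))
    # -> physical review letters  (the multi-word key wins: longer abbreviations run first)
    print(expand_abbreviations('ieee trans. robot. autom.'))
    # -> ieee transactions robotics automation
    print(expand_abbreviations('proc. natl. acad. sci.'))
    # -> proceedings of the national academy of sciences
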
@@ -4531,7 +4594,7 @@ def normalize_venue_for_display(venue: str) -> str:
     prefixes_to_remove = [
         r'^\d{4}\s+\d+(st|nd|rd|th)\s+',  # "2012 IEEE/RSJ"
         r'^\d{4}\s+',  # "2024 "
-        r'^proceedings\s+(of\s+)?(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?',  # "Proceedings of the IEEE"
+        r'^proceedings\s+(of\s+)?(the\s+)?((acm|ieee|usenix|aaai|sigcomm|sigkdd|sigmod|sigops|vldb|osdi|sosp|eurosys)\s+)*(\d+(st|nd|rd|th)\s+)?',  # "Proceedings of the [ORG] [ORG] 29th"
         r'^proc\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?',  # "Proc. of the IEEE" (require "of")
         r'^procs\.\s+of\s+(the\s+)?(\d+(st|nd|rd|th)\s+)?(ieee\s+)?',  # "Procs. of the IEEE" (require "of")
         r'^in\s+',
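
The widened pattern now strips stacked organization names ahead of an ordinal. An illustrative check with the pattern copied from the hunk (normalize_venue_for_display itself does more than this; re.IGNORECASE stands in for whatever case normalization the real function applies):

    import re

    prefix = re.compile(
        r'^proceedings\s+(of\s+)?(the\s+)?'
        r'((acm|ieee|usenix|aaai|sigcomm|sigkdd|sigmod|sigops|vldb|osdi|sosp|eurosys)\s+)*'
        r'(\d+(st|nd|rd|th)\s+)?',
        re.IGNORECASE)

    print(prefix.sub('', 'Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles', count=1))
    # -> Symposium on Operating Systems Principles
    print(prefix.sub('', 'Proceedings of the IEEE International Conference on Robotics and Automation', count=1))
    # -> International Conference on Robotics and Automation
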

--- academic_refchecker-1.2.42/src/utils/url_utils.py
+++ academic_refchecker-1.2.44/src/utils/url_utils.py
@@ -209,7 +209,14 @@ def validate_url_format(url: str) -> bool:
 
 def clean_url(url: str) -> str:
     """
-    Clean a URL by removing common issues like extra spaces, fragments, etc.
+    Clean a URL by removing common issues like extra spaces, fragments, malformed LaTeX, etc.
+
+    This function handles:
+    - Whitespace trimming
+    - Malformed LaTeX URL wrappers like \\url{https://...}
+    - Markdown-style links like [text](url)
+    - Trailing punctuation from academic references
+    - DOI URL query parameter cleanup
 
     Args:
         url: URL to clean
@@ -223,6 +230,26 @@ def clean_url(url: str) -> str:
     # Remove leading/trailing whitespace
     url = url.strip()
 
+    # Handle malformed URLs that contain \url{} wrappers within the URL text
+    # e.g., "https://\url{https://www.example.com/}" -> "https://www.example.com/"
+    import re
+    url_pattern = r'https?://\\url\{(https?://[^}]+)\}'
+    url_match = re.search(url_pattern, url)
+    if url_match:
+        url = url_match.group(1)
+
+    # Handle markdown-style links like [text](url) or [url](url)
+    # e.g., "[https://example.com](https://example.com)" -> "https://example.com"
+    markdown_pattern = r'\[([^\]]*)\]\((https?://[^)]+)\)'
+    markdown_match = re.search(markdown_pattern, url)
+    if markdown_match:
+        # Use the URL from parentheses
+        url = markdown_match.group(2)
+
+    # Remove trailing punctuation that's commonly part of sentence structure
+    # but preserve legitimate URL characters
+    url = url.rstrip('.,;!?)')
+
     # Note: Preserving query parameters for all URLs now
     # Previously this function removed query parameters for non-DOI URLs,
     # but this was causing issues with OpenReview and other URLs that need their parameters
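
A hedged usage sketch of the new cleanup paths in clean_url (assumes utils.url_utils is importable; the expected outputs follow only from the logic visible in this hunk, assuming no later step in the function rewrites these particular inputs):

    from utils.url_utils import clean_url

    # Malformed \url{} wrapper left over from LaTeX-sourced BibTeX
    print(clean_url('https://\\url{https://www.example.com/}'))
    # -> https://www.example.com/

    # Markdown-style link: the URL in parentheses is kept
    print(clean_url('[https://example.com](https://example.com)'))
    # -> https://example.com

    # Trailing sentence punctuation is stripped (hypothetical arXiv URL)
    print(clean_url('https://arxiv.org/abs/2301.00001.'))
    # -> https://arxiv.org/abs/2301.00001
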
@@ -254,6 +281,22 @@ def clean_url_punctuation(url: str) -> str:
     # Remove leading/trailing whitespace
     url = url.strip()
 
+    # Handle malformed URLs that contain \\url{} wrappers within the URL text
+    # e.g., "https://\\url{https://www.example.com/}" -> "https://www.example.com/"
+    import re
+    url_pattern = r'https?://\\url\{(https?://[^}]+)\}'
+    url_match = re.search(url_pattern, url)
+    if url_match:
+        url = url_match.group(1)
+
+    # Handle markdown-style links like [text](url) or [url](url)
+    # e.g., "[https://example.com](https://example.com)" -> "https://example.com"
+    markdown_pattern = r'\[([^\]]*)\]\((https?://[^)]+)\)'
+    markdown_match = re.search(markdown_pattern, url)
+    if markdown_match:
+        # Use the URL from parentheses
+        url = markdown_match.group(2)
+
     # Remove trailing punctuation that's commonly part of sentence structure
     # but preserve legitimate URL characters
     url = url.rstrip('.,;!?)')
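
clean_url_punctuation gains the same \url{} and markdown handling; combined with its existing rstrip, a reference-style URL with trailing punctuation comes out clean while query parameters survive (hypothetical OpenReview id):

    from utils.url_utils import clean_url_punctuation

    print(clean_url_punctuation('https://openreview.net/forum?id=abc123).'))
    # -> https://openreview.net/forum?id=abc123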