scitex 2.11.0__py3-none-any.whl → 2.13.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. scitex/__main__.py +24 -5
  2. scitex/__version__.py +1 -1
  3. scitex/_optional_deps.py +33 -0
  4. scitex/ai/classification/reporters/_ClassificationReporter.py +1 -1
  5. scitex/ai/classification/timeseries/_TimeSeriesBlockingSplit.py +2 -2
  6. scitex/ai/classification/timeseries/_TimeSeriesCalendarSplit.py +2 -2
  7. scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit.py +2 -2
  8. scitex/ai/classification/timeseries/_TimeSeriesSlidingWindowSplit_v01-not-using-n_splits.py +2 -2
  9. scitex/ai/classification/timeseries/_TimeSeriesStratifiedSplit.py +2 -2
  10. scitex/ai/classification/timeseries/_normalize_timestamp.py +1 -1
  11. scitex/ai/metrics/_calc_seizure_prediction_metrics.py +1 -1
  12. scitex/ai/plt/_plot_feature_importance.py +1 -1
  13. scitex/ai/plt/_plot_learning_curve.py +1 -1
  14. scitex/ai/plt/_plot_optuna_study.py +1 -1
  15. scitex/ai/plt/_plot_pre_rec_curve.py +1 -1
  16. scitex/ai/plt/_plot_roc_curve.py +1 -1
  17. scitex/ai/plt/_stx_conf_mat.py +1 -1
  18. scitex/ai/training/_LearningCurveLogger.py +1 -1
  19. scitex/audio/mcp_server.py +38 -8
  20. scitex/browser/automation/CookieHandler.py +1 -1
  21. scitex/browser/core/BrowserMixin.py +1 -1
  22. scitex/browser/core/ChromeProfileManager.py +1 -1
  23. scitex/browser/debugging/_browser_logger.py +1 -1
  24. scitex/browser/debugging/_highlight_element.py +1 -1
  25. scitex/browser/debugging/_show_grid.py +1 -1
  26. scitex/browser/interaction/click_center.py +1 -1
  27. scitex/browser/interaction/click_with_fallbacks.py +1 -1
  28. scitex/browser/interaction/close_popups.py +1 -1
  29. scitex/browser/interaction/fill_with_fallbacks.py +1 -1
  30. scitex/browser/pdf/click_download_for_chrome_pdf_viewer.py +1 -1
  31. scitex/browser/pdf/detect_chrome_pdf_viewer.py +1 -1
  32. scitex/browser/stealth/HumanBehavior.py +1 -1
  33. scitex/browser/stealth/StealthManager.py +1 -1
  34. scitex/canvas/_mcp_handlers.py +372 -0
  35. scitex/canvas/_mcp_tool_schemas.py +219 -0
  36. scitex/canvas/mcp_server.py +151 -0
  37. scitex/capture/mcp_server.py +41 -12
  38. scitex/cli/audio.py +233 -0
  39. scitex/cli/capture.py +307 -0
  40. scitex/cli/main.py +27 -4
  41. scitex/cli/repro.py +233 -0
  42. scitex/cli/resource.py +240 -0
  43. scitex/cli/stats.py +325 -0
  44. scitex/cli/template.py +236 -0
  45. scitex/cli/tex.py +286 -0
  46. scitex/cli/web.py +11 -12
  47. scitex/dev/__init__.py +3 -0
  48. scitex/dev/_pyproject.py +405 -0
  49. scitex/dev/plt/__init__.py +2 -2
  50. scitex/dev/plt/mpl/get_dir_ax.py +1 -1
  51. scitex/dev/plt/mpl/get_signatures.py +1 -1
  52. scitex/dev/plt/mpl/get_signatures_details.py +1 -1
  53. scitex/diagram/_mcp_handlers.py +400 -0
  54. scitex/diagram/_mcp_tool_schemas.py +157 -0
  55. scitex/diagram/mcp_server.py +151 -0
  56. scitex/dsp/_demo_sig.py +51 -5
  57. scitex/dsp/_mne.py +13 -2
  58. scitex/dsp/_modulation_index.py +15 -3
  59. scitex/dsp/_pac.py +23 -5
  60. scitex/dsp/_psd.py +16 -4
  61. scitex/dsp/_resample.py +24 -4
  62. scitex/dsp/_transform.py +16 -3
  63. scitex/dsp/add_noise.py +15 -1
  64. scitex/dsp/norm.py +17 -2
  65. scitex/dsp/reference.py +17 -1
  66. scitex/dsp/utils/_differential_bandpass_filters.py +20 -2
  67. scitex/dsp/utils/_zero_pad.py +18 -4
  68. scitex/dt/_normalize_timestamp.py +1 -1
  69. scitex/git/_session.py +1 -1
  70. scitex/io/_load_modules/_con.py +12 -1
  71. scitex/io/_load_modules/_eeg.py +12 -1
  72. scitex/io/_load_modules/_optuna.py +21 -63
  73. scitex/io/_load_modules/_torch.py +11 -3
  74. scitex/io/_save_modules/_optuna_study_as_csv_and_pngs.py +13 -2
  75. scitex/io/_save_modules/_torch.py +11 -3
  76. scitex/mcp_server.py +159 -0
  77. scitex/plt/_mcp_handlers.py +361 -0
  78. scitex/plt/_mcp_tool_schemas.py +169 -0
  79. scitex/plt/mcp_server.py +205 -0
  80. scitex/repro/README_RandomStateManager.md +3 -3
  81. scitex/repro/_RandomStateManager.py +14 -14
  82. scitex/repro/_gen_ID.py +1 -1
  83. scitex/repro/_gen_timestamp.py +1 -1
  84. scitex/repro/_hash_array.py +4 -4
  85. scitex/scholar/__main__.py +24 -2
  86. scitex/scholar/_mcp_handlers.py +685 -0
  87. scitex/scholar/_mcp_tool_schemas.py +339 -0
  88. scitex/scholar/docs/template.py +1 -1
  89. scitex/scholar/examples/07_storage_integration.py +1 -1
  90. scitex/scholar/impact_factor/jcr/ImpactFactorJCREngine.py +1 -1
  91. scitex/scholar/impact_factor/jcr/build_database.py +1 -1
  92. scitex/scholar/mcp_server.py +315 -0
  93. scitex/scholar/pdf_download/ScholarPDFDownloader.py +1 -1
  94. scitex/scholar/pipelines/ScholarPipelineBibTeX.py +1 -1
  95. scitex/scholar/pipelines/ScholarPipelineParallel.py +1 -1
  96. scitex/scholar/pipelines/ScholarPipelineSingle.py +1 -1
  97. scitex/scholar/storage/PaperIO.py +1 -1
  98. scitex/session/README.md +4 -4
  99. scitex/session/__init__.py +1 -1
  100. scitex/session/_decorator.py +9 -9
  101. scitex/session/_lifecycle.py +5 -5
  102. scitex/session/template.py +1 -1
  103. scitex/stats/__main__.py +281 -0
  104. scitex/stats/_mcp_handlers.py +1191 -0
  105. scitex/stats/_mcp_tool_schemas.py +384 -0
  106. scitex/stats/correct/_correct_bonferroni.py +1 -1
  107. scitex/stats/correct/_correct_fdr.py +1 -1
  108. scitex/stats/correct/_correct_fdr_.py +1 -1
  109. scitex/stats/correct/_correct_holm.py +1 -1
  110. scitex/stats/correct/_correct_sidak.py +1 -1
  111. scitex/stats/effect_sizes/_cliffs_delta.py +1 -1
  112. scitex/stats/effect_sizes/_cohens_d.py +1 -1
  113. scitex/stats/effect_sizes/_epsilon_squared.py +1 -1
  114. scitex/stats/effect_sizes/_eta_squared.py +1 -1
  115. scitex/stats/effect_sizes/_prob_superiority.py +1 -1
  116. scitex/stats/mcp_server.py +405 -0
  117. scitex/stats/posthoc/_dunnett.py +1 -1
  118. scitex/stats/posthoc/_games_howell.py +1 -1
  119. scitex/stats/posthoc/_tukey_hsd.py +1 -1
  120. scitex/stats/power/_power.py +1 -1
  121. scitex/stats/utils/_effect_size.py +1 -1
  122. scitex/stats/utils/_formatters.py +1 -1
  123. scitex/stats/utils/_power.py +1 -1
  124. scitex/template/_mcp_handlers.py +259 -0
  125. scitex/template/_mcp_tool_schemas.py +112 -0
  126. scitex/template/mcp_server.py +186 -0
  127. scitex/utils/_verify_scitex_format.py +2 -2
  128. scitex/utils/template.py +1 -1
  129. scitex/web/__init__.py +12 -11
  130. scitex/web/_scraping.py +26 -265
  131. scitex/web/download_images.py +316 -0
  132. scitex/writer/Writer.py +1 -1
  133. scitex/writer/_clone_writer_project.py +1 -1
  134. scitex/writer/_validate_tree_structures.py +1 -1
  135. scitex/writer/dataclasses/config/_WriterConfig.py +1 -1
  136. scitex/writer/dataclasses/contents/_ManuscriptContents.py +1 -1
  137. scitex/writer/dataclasses/core/_Document.py +1 -1
  138. scitex/writer/dataclasses/core/_DocumentSection.py +1 -1
  139. scitex/writer/dataclasses/results/_CompilationResult.py +1 -1
  140. scitex/writer/dataclasses/results/_LaTeXIssue.py +1 -1
  141. scitex/writer/utils/.legacy_git_retry.py +7 -5
  142. scitex/writer/utils/_parse_latex_logs.py +1 -1
  143. {scitex-2.11.0.dist-info → scitex-2.13.0.dist-info}/METADATA +431 -269
  144. {scitex-2.11.0.dist-info → scitex-2.13.0.dist-info}/RECORD +147 -118
  145. scitex-2.13.0.dist-info/entry_points.txt +11 -0
  146. scitex-2.11.0.dist-info/entry_points.txt +0 -2
  147. {scitex-2.11.0.dist-info → scitex-2.13.0.dist-info}/WHEEL +0 -0
  148. {scitex-2.11.0.dist-info → scitex-2.13.0.dist-info}/licenses/LICENSE +0 -0
scitex/web/_scraping.py CHANGED
@@ -1,40 +1,21 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 # File: ./src/scitex/web/_scraping.py
 
-"""Web scraping utilities for extracting URLs and downloading images."""
+"""Web scraping utilities for extracting URLs."""
 
-import os
 import re
 import urllib.parse
-from datetime import datetime
-from pathlib import Path
-from typing import List, Optional, Set, Tuple
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import List, Optional, Set
 
 import requests
 from bs4 import BeautifulSoup
-from tqdm import tqdm
-
-try:
-    from PIL import Image
-    from io import BytesIO
-
-    PILLOW_AVAILABLE = True
-except ImportError:
-    PILLOW_AVAILABLE = False
 
 from scitex.logging import getLogger
 
 logger = getLogger(__name__)
 
-
-def _get_default_download_dir() -> str:
-    """Get default download directory using SCITEX_DIR if available."""
-    scitex_root = os.environ.get("SCITEX_DIR")
-    if scitex_root is None:
-        scitex_root = os.path.expanduser("~/.scitex")
-    return os.path.join(scitex_root, "web", "downloads")
+DEFAULT_TIMEOUT = 10
+DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
 
 
 def get_urls(
@@ -49,7 +30,7 @@ def get_urls(
 
     Args:
         url: The URL of the webpage to scrape
-        pattern: Optional regex pattern to filter URLs (e.g., r'\.pdf$' for PDF files)
+        pattern: Optional regex pattern to filter URLs (e.g., r'\\.pdf$' for PDF files)
         absolute: If True, convert relative URLs to absolute URLs
         same_domain: If True, only return URLs from the same domain
         include_external: If True, include external links (only applies if same_domain=False)
@@ -58,12 +39,16 @@ def get_urls(
         List of URLs found on the page
 
     Example:
-        >>> urls = get_urls('https://example.com', pattern=r'\.pdf$')
+        >>> urls = get_urls('https://example.com', pattern=r'\\.pdf$')
        >>> urls = get_urls('https://example.com', same_domain=True)
    """
    try:
        logger.info(f"Fetching URLs from: {url}")
-        response = requests.get(url, timeout=30)
+        response = requests.get(
+            url,
+            timeout=DEFAULT_TIMEOUT,
+            headers={"User-Agent": DEFAULT_USER_AGENT},
+        )
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Failed to fetch URL {url}: {e}")
@@ -72,19 +57,14 @@ def get_urls(
     soup = BeautifulSoup(response.text, "html.parser")
     urls_found: Set[str] = set()
 
-    # Parse the base domain
     parsed_base = urllib.parse.urlparse(url)
-    base_domain = f"{parsed_base.scheme}://{parsed_base.netloc}"
 
-    # Find all links
     for link in soup.find_all("a", href=True):
         href = link["href"]
 
-        # Convert to absolute URL if requested
         if absolute:
             href = urllib.parse.urljoin(url, href)
 
-        # Filter by domain if requested
         if same_domain:
             parsed_href = urllib.parse.urlparse(href)
             if parsed_href.netloc != parsed_base.netloc:
@@ -94,10 +74,8 @@ def get_urls(
             if parsed_href.netloc and parsed_href.netloc != parsed_base.netloc:
                 continue
 
-        # Filter by pattern if provided
-        if pattern:
-            if not re.search(pattern, href):
-                continue
+        if pattern and not re.search(pattern, href):
+            continue
 
         urls_found.add(href)
 
@@ -106,171 +84,6 @@ def get_urls(
     return result
 
 
-def download_images(
-    url: str,
-    output_dir: Optional[str] = None,
-    pattern: Optional[str] = None,
-    min_size: Optional[Tuple[int, int]] = None,
-    max_workers: int = 5,
-    same_domain: bool = False,
-) -> List[str]:
-    """
-    Download all images from a webpage.
-
-    Args:
-        url: The URL of the webpage to scrape
-        output_dir: Directory to save images. Priority:
-            1. This parameter if specified
-            2. $SCITEX_WEB_DOWNLOADS_DIR environment variable
-            3. $SCITEX_DIR/web/downloads (default)
-        pattern: Optional regex pattern to filter image URLs
-        min_size: Optional minimum size as (width, height) tuple to filter images
-        max_workers: Number of concurrent download threads
-        same_domain: If True, only download images from the same domain
-
-    Returns:
-        List of paths to downloaded images
-
-    Note:
-        - SVG files are automatically skipped (vector graphics)
-        - Images are saved in timestamped subdirectories: images-YYYYMMDD_HHMMSS/
-
-    Example:
-        >>> paths = download_images('https://example.com', output_dir='./downloads')
-        >>> paths = download_images('https://example.com', min_size=(100, 100))
-        >>> # Uses $SCITEX_WEB_DOWNLOADS_DIR or $SCITEX_DIR/web/downloads
-        >>> paths = download_images('https://example.com')
-    """
-    if not PILLOW_AVAILABLE:
-        logger.warning("Pillow is not available. min_size filtering will be disabled.")
-
-    # Set default output directory
-    if output_dir is None:
-        # Check SCITEX_WEB_DOWNLOADS_DIR first
-        output_dir = os.environ.get("SCITEX_WEB_DOWNLOADS_DIR")
-        if output_dir is None:
-            # Fall back to SCITEX_DIR/web/downloads
-            output_dir = _get_default_download_dir()
-
-    # Create timestamped subdirectory
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    output_path = Path(output_dir).expanduser() / f"images-{timestamp}"
-    output_path.mkdir(parents=True, exist_ok=True)
-
-    logger.info(f"Saving images to: {output_path}")
-
-    try:
-        logger.info(f"Fetching page: {url}")
-        response = requests.get(url, timeout=30)
-        response.raise_for_status()
-    except requests.RequestException as e:
-        logger.error(f"Failed to fetch URL {url}: {e}")
-        return []
-
-    soup = BeautifulSoup(response.text, "html.parser")
-    image_urls: Set[str] = set()
-
-    # Parse the base domain
-    parsed_base = urllib.parse.urlparse(url)
-
-    # Find all image tags
-    for img in soup.find_all("img", src=True):
-        img_url = img["src"]
-
-        # Convert to absolute URL
-        img_url = urllib.parse.urljoin(url, img_url)
-
-        # Skip SVG files (vector graphics, not raster images)
-        if img_url.lower().endswith((".svg", ".svgz")):
-            continue
-
-        # Filter by domain if requested
-        if same_domain:
-            parsed_img = urllib.parse.urlparse(img_url)
-            if parsed_img.netloc != parsed_base.netloc:
-                continue
-
-        # Filter by pattern if provided
-        if pattern:
-            if not re.search(pattern, img_url):
-                continue
-
-        image_urls.add(img_url)
-
-    logger.info(f"Found {len(image_urls)} images")
-
-    # Download images
-    downloaded_paths = []
-
-    def download_image(img_url: str) -> Optional[str]:
-        try:
-            img_response = requests.get(img_url, timeout=30)
-            img_response.raise_for_status()
-
-            # Check image size if requested and Pillow is available
-            if min_size and PILLOW_AVAILABLE:
-                try:
-                    img = Image.open(BytesIO(img_response.content))
-                    if img.size[0] < min_size[0] or img.size[1] < min_size[1]:
-                        return None
-                except Exception:
-                    pass
-
-            # Generate filename from URL
-            parsed_url = urllib.parse.urlparse(img_url)
-            filename = Path(parsed_url.path).name
-
-            # If filename is empty or doesn't have extension, generate one
-            if not filename or "." not in filename:
-                ext = ".jpg"  # default extension
-                if "content-type" in img_response.headers:
-                    content_type = img_response.headers["content-type"]
-                    if "png" in content_type:
-                        ext = ".png"
-                    elif "gif" in content_type:
-                        ext = ".gif"
-                    elif "webp" in content_type:
-                        ext = ".webp"
-                filename = f"image_{hash(img_url)}{ext}"
-
-            # Save image
-            file_path = output_path / filename
-
-            # Handle duplicate filenames
-            counter = 1
-            original_stem = file_path.stem
-            while file_path.exists():
-                file_path = output_path / f"{original_stem}_{counter}{file_path.suffix}"
-                counter += 1
-
-            with open(file_path, "wb") as f:
-                f.write(img_response.content)
-
-            return str(file_path)
-
-        except Exception as e:
-            logger.warning(f"Failed to download image {img_url}: {e}")
-            return None
-
-    # Download images concurrently
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        future_to_url = {
-            executor.submit(download_image, img_url): img_url for img_url in image_urls
-        }
-
-        for future in tqdm(
-            as_completed(future_to_url),
-            total=len(image_urls),
-            desc="Downloading images",
-        ):
-            result = future.result()
-            if result:
-                downloaded_paths.append(result)
-
-    logger.info(f"Downloaded {len(downloaded_paths)} images to {output_dir}")
-    return downloaded_paths
-
-
 def get_image_urls(
     url: str,
     pattern: Optional[str] = None,
@@ -289,14 +102,19 @@ def get_image_urls(
 
     Note:
         - SVG files are automatically skipped (vector graphics)
+        - Checks both 'src' and 'data-src' attributes for lazy-loaded images
 
     Example:
         >>> img_urls = get_image_urls('https://example.com')
-        >>> img_urls = get_image_urls('https://example.com', pattern=r'\.png$')
+        >>> img_urls = get_image_urls('https://example.com', pattern=r'\\.png$')
    """
    try:
        logger.info(f"Fetching image URLs from: {url}")
-        response = requests.get(url, timeout=30)
+        response = requests.get(
+            url,
+            timeout=DEFAULT_TIMEOUT,
+            headers={"User-Agent": DEFAULT_USER_AGENT},
+        )
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Failed to fetch URL {url}: {e}")
@@ -305,85 +123,28 @@ def get_image_urls(
     soup = BeautifulSoup(response.text, "html.parser")
     image_urls: Set[str] = set()
 
-    # Parse the base domain
     parsed_base = urllib.parse.urlparse(url)
 
-    # Find all image tags
-    for img in soup.find_all("img", src=True):
-        img_url = img["src"]
+    for img in soup.find_all("img"):
+        img_url = img.get("src") or img.get("data-src")
+        if not img_url:
+            continue
 
-        # Convert to absolute URL
         img_url = urllib.parse.urljoin(url, img_url)
 
-        # Skip SVG files (vector graphics, not raster images)
         if img_url.lower().endswith((".svg", ".svgz")):
             continue
 
-        # Filter by domain if requested
         if same_domain:
             parsed_img = urllib.parse.urlparse(img_url)
             if parsed_img.netloc != parsed_base.netloc:
                 continue
 
-        # Filter by pattern if provided
-        if pattern:
-            if not re.search(pattern, img_url):
-                continue
+        if pattern and not re.search(pattern, img_url):
+            continue
 
         image_urls.add(img_url)
 
     result = sorted(list(image_urls))
     logger.info(f"Found {len(result)} image URLs")
     return result
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser(description="Web scraping utilities")
-    parser.add_argument("url", type=str, help="URL to scrape")
-    parser.add_argument(
-        "--mode",
-        "-m",
-        choices=["urls", "images", "image_urls"],
-        default="urls",
-        help="Scraping mode",
-    )
-    parser.add_argument("--output", "-o", type=str, help="Output directory for images")
-    parser.add_argument(
-        "--pattern", "-p", type=str, help="Regex pattern to filter URLs"
-    )
-    parser.add_argument(
-        "--same-domain", action="store_true", help="Only include URLs from same domain"
-    )
-    parser.add_argument(
-        "--min-size", type=str, help="Minimum image size as WIDTHxHEIGHT"
-    )
-
-    args = parser.parse_args()
-
-    if args.mode == "urls":
-        urls = get_urls(args.url, pattern=args.pattern, same_domain=args.same_domain)
-        for url in urls:
-            print(url)
-    elif args.mode == "images":
-        min_size = None
-        if args.min_size:
-            width, height = map(int, args.min_size.split("x"))
-            min_size = (width, height)
-
-        paths = download_images(
-            args.url,
-            output_dir=args.output,
-            pattern=args.pattern,
-            min_size=min_size,
-            same_domain=args.same_domain,
-        )
-        for path in paths:
-            print(path)
-    elif args.mode == "image_urls":
-        img_urls = get_image_urls(
-            args.url, pattern=args.pattern, same_domain=args.same_domain
-        )
-        for img_url in img_urls:
-            print(img_url)
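Taken together, these changes strip _scraping.py down to URL extraction only; downloading moves to the new scitex/web/download_images.py shown next. A minimal usage sketch based on the docstring examples above (importing from the private module path given in the file header; the public re-export via scitex.web may differ):

    # Hypothetical usage sketch; mirrors the docstring examples in the diff above.
    from scitex.web._scraping import get_urls, get_image_urls

    # Links matching a regex, restricted to the page's own domain
    pdf_links = get_urls("https://example.com", pattern=r"\.pdf$", same_domain=True)

    # Image URLs, now also picking up lazy-loaded images declared via data-src
    image_links = get_image_urls("https://example.com", pattern=r"\.png$")

    for link in pdf_links + image_links:
        print(link)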
scitex/web/download_images.py ADDED
@@ -0,0 +1,316 @@
+#!/usr/bin/env python3
+# File: ./src/scitex/web/download_images.py
+
+"""
+Image Downloader for SciTeX.
+
+Downloads images from URLs with minimum size filtering.
+
+Usage:
+    python -m scitex.web.download_images https://example.com
+    python -m scitex.web.download_images https://example.com -o ./downloads
+    python -m scitex.web.download_images https://example.com --min-size 800x600
+"""
+
+import os
+import re
+import urllib.parse
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+import requests
+from bs4 import BeautifulSoup
+from tqdm import tqdm
+
+try:
+    from io import BytesIO
+
+    from PIL import Image
+
+    PILLOW_AVAILABLE = True
+except ImportError:
+    PILLOW_AVAILABLE = False
+
+from scitex.logging import getLogger
+
+logger = getLogger(__name__)
+
+# Configuration
+DEFAULT_MIN_WIDTH = 400
+DEFAULT_MIN_HEIGHT = 300
+DEFAULT_TIMEOUT = 10
+DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+
+
+def _get_default_download_dir() -> str:
+    """Get default download directory using SCITEX_DIR if available."""
+    scitex_root = os.environ.get("SCITEX_DIR", os.path.expanduser("~/.scitex"))
+    return os.path.join(scitex_root, "web", "downloads")
+
+
+def _normalize_url_for_directory(url: str) -> str:
+    """Convert URL to a safe directory name."""
+    parsed = urllib.parse.urlparse(url)
+    domain = parsed.netloc.replace("www.", "")
+    path = parsed.path.strip("/").replace("/", "-")
+
+    normalized = f"{domain}-{path}" if path else domain
+    normalized = re.sub(r"[^\w\-.]", "-", normalized)
+    normalized = re.sub(r"-+", "-", normalized)
+    normalized = normalized[:100].strip("-")
+
+    return normalized
+
+
+def _is_direct_image_url(url: str) -> bool:
+    """Check if URL appears to be a direct image link."""
+    extensions = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"]
+    path = urllib.parse.urlparse(url.lower()).path
+    return any(path.endswith(ext) for ext in extensions)
+
+
+def _extract_image_urls(url: str, same_domain: bool = False) -> List[str]:
+    """Extract image URLs from a webpage."""
+    try:
+        logger.info(f"Fetching page: {url}")
+        response = requests.get(
+            url,
+            timeout=DEFAULT_TIMEOUT,
+            headers={"User-Agent": DEFAULT_USER_AGENT},
+        )
+        response.raise_for_status()
+    except requests.RequestException as e:
+        logger.error(f"Failed to fetch page: {e}")
+        return []
+
+    soup = BeautifulSoup(response.content, "html.parser")
+    parsed_base = urllib.parse.urlparse(url)
+    image_urls = set()
+
+    for img in soup.find_all("img"):
+        img_url = img.get("src") or img.get("data-src")
+        if not img_url:
+            continue
+
+        img_url = urllib.parse.urljoin(url, img_url)
+
+        if img_url.lower().endswith((".svg", ".svgz")):
+            continue
+
+        if same_domain:
+            parsed_img = urllib.parse.urlparse(img_url)
+            if parsed_img.netloc != parsed_base.netloc:
+                continue
+
+        image_urls.add(img_url)
+
+    logger.info(f"Found {len(image_urls)} images on page")
+    return list(image_urls)
+
+
+def _download_single_image(
+    img_url: str,
+    output_dir: Path,
+    counter: int,
+    min_size: Optional[Tuple[int, int]],
+) -> Optional[str]:
+    """Download a single image."""
+    try:
+        response = requests.get(
+            img_url,
+            timeout=DEFAULT_TIMEOUT,
+            headers={"User-Agent": DEFAULT_USER_AGENT},
+        )
+        response.raise_for_status()
+
+        # Validate content-type
+        content_type = response.headers.get("content-type", "")
+        if not content_type.startswith("image/"):
+            logger.debug(f"Skipping non-image: {content_type}")
+            return None
+
+        # Check dimensions
+        if min_size and PILLOW_AVAILABLE:
+            try:
+                img = Image.open(BytesIO(response.content))
+                width, height = img.size
+                if width < min_size[0] or height < min_size[1]:
+                    logger.debug(
+                        f"Skipping small image: {width}x{height} "
+                        f"(min: {min_size[0]}x{min_size[1]})"
+                    )
+                    return None
+            except Exception:
+                pass
+
+        # Determine extension
+        ext = "jpg"
+        if PILLOW_AVAILABLE:
+            try:
+                img = Image.open(BytesIO(response.content))
+                fmt = img.format.lower() if img.format else "jpeg"
+                ext = "jpg" if fmt == "jpeg" else fmt
+            except Exception:
+                pass
+        elif "png" in content_type:
+            ext = "png"
+        elif "gif" in content_type:
+            ext = "gif"
+        elif "webp" in content_type:
+            ext = "webp"
+
+        filename = f"{counter:04d}.{ext}"
+        filepath = output_dir / filename
+
+        with open(filepath, "wb") as f:
+            f.write(response.content)
+
+        logger.info(f"Downloaded: {filename}")
+        return str(filepath)
+
+    except Exception as e:
+        logger.warning(f"Error downloading {img_url}: {e}")
+        return None
+
+
+def download_images(
+    url: str,
+    output_dir: Optional[str] = None,
+    min_size: Optional[Tuple[int, int]] = None,
+    max_workers: int = 5,
+    same_domain: bool = False,
+) -> List[str]:
+    """
+    Download images from a URL.
+
+    Args:
+        url: Webpage URL or direct image URL
+        output_dir: Output directory (default: $SCITEX_DIR/web/downloads)
+        min_size: Minimum (width, height) to filter small images (default: 400x300)
+        max_workers: Concurrent download threads
+        same_domain: Only download images from the same domain
+
+    Returns:
+        List of downloaded file paths
+
+    Example:
+        >>> paths = download_images("https://example.com")
+        >>> paths = download_images("https://example.com/photo.jpg")
+        >>> paths = download_images("https://example.com", min_size=(800, 600))
+    """
+    if not PILLOW_AVAILABLE:
+        logger.warning("Pillow not available. Size filtering disabled.")
+        min_size = None
+    elif min_size is None:
+        min_size = (DEFAULT_MIN_WIDTH, DEFAULT_MIN_HEIGHT)
+
+    # Setup output directory
+    if output_dir is None:
+        output_dir = os.environ.get("SCITEX_WEB_DOWNLOADS_DIR")
+        if output_dir is None:
+            output_dir = _get_default_download_dir()
+
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    normalized = _normalize_url_for_directory(url)
+    output_path = Path(output_dir).expanduser() / f"{timestamp}-{normalized}-images"
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    logger.info(f"Output directory: {output_path}")
+
+    # Get image URLs
+    if _is_direct_image_url(url):
+        image_urls = [url]
+        logger.info("Direct image URL detected")
+    else:
+        image_urls = _extract_image_urls(url, same_domain=same_domain)
+
+    if not image_urls:
+        logger.warning("No images found")
+        return []
+
+    # Download concurrently
+    downloaded = []
+    counter = [1]
+
+    def download_with_counter(img_url: str) -> Optional[str]:
+        idx = counter[0]
+        counter[0] += 1
+        return _download_single_image(img_url, output_path, idx, min_size)
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = {executor.submit(download_with_counter, u): u for u in image_urls}
+
+        for future in tqdm(
+            as_completed(futures), total=len(image_urls), desc="Downloading"
+        ):
+            result = future.result()
+            if result:
+                downloaded.append(result)
+
+    logger.info(f"Downloaded {len(downloaded)} images to {output_path}")
+    return downloaded
+
+
+def main():
+    """CLI entry point."""
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Download images from URL",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    python -m scitex.web.download_images https://example.com
+    python -m scitex.web.download_images https://example.com -o ./downloads
+    python -m scitex.web.download_images https://example.com --min-size 800x600
+    python -m scitex.web.download_images https://example.com --no-min-size
+""",
+    )
+    parser.add_argument("url", help="URL to download images from")
+    parser.add_argument("-o", "--output", help="Output directory")
+    parser.add_argument(
+        "--min-size",
+        default="400x300",
+        help="Minimum size WIDTHxHEIGHT (default: 400x300)",
+    )
+    parser.add_argument(
+        "--no-min-size",
+        action="store_true",
+        help="Disable size filtering",
+    )
+    parser.add_argument(
+        "--same-domain",
+        action="store_true",
+        help="Only download from same domain",
+    )
+    parser.add_argument(
+        "--workers",
+        type=int,
+        default=5,
+        help="Concurrent downloads (default: 5)",
+    )
+
+    args = parser.parse_args()
+
+    min_size = None
+    if not args.no_min_size and args.min_size:
+        w, h = map(int, args.min_size.split("x"))
+        min_size = (w, h)
+
+    paths = download_images(
+        args.url,
+        output_dir=args.output,
+        min_size=min_size,
+        max_workers=args.workers,
+        same_domain=args.same_domain,
+    )
+
+    print(f"\nDownloaded {len(paths)} images:")
+    for p in paths:
+        print(f"  {p}")
+
+
+if __name__ == "__main__":
+    main()
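For reference, a minimal sketch of calling the new module programmatically, using only the download_images() signature shown in the diff above (the URL and output directory are illustrative):

    # Illustrative values; the keyword arguments match the signature in the diff above.
    from scitex.web.download_images import download_images

    paths = download_images(
        "https://example.com",      # webpage or direct image URL
        output_dir="./downloads",   # defaults to $SCITEX_WEB_DOWNLOADS_DIR, then $SCITEX_DIR/web/downloads
        min_size=(800, 600),        # forced to None when Pillow is unavailable
        max_workers=5,
        same_domain=True,
    )
    print(f"Saved {len(paths)} images")

The same behaviour is available on the command line via python -m scitex.web.download_images, as documented in the module docstring.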