sf-config-builder 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sf_config_builder-0.1.1.dist-info/METADATA +316 -0
- sf_config_builder-0.1.1.dist-info/RECORD +10 -0
- sf_config_builder-0.1.1.dist-info/WHEEL +5 -0
- sf_config_builder-0.1.1.dist-info/licenses/LICENSE +21 -0
- sf_config_builder-0.1.1.dist-info/top_level.txt +1 -0
- sfconfig/__init__.py +34 -0
- sfconfig/config.py +767 -0
- sfconfig/diff.py +145 -0
- sfconfig/exceptions.py +26 -0
- sfconfig/paths.py +217 -0
sfconfig/config.py
ADDED
@@ -0,0 +1,767 @@
"""SFConfig class for managing Screaming Frog configuration files."""

import json
import subprocess
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from .diff import SFDiff
from .exceptions import SFConfigError, SFCrawlError, SFParseError, SFValidationError
from .paths import get_classpath_separator, get_java_path, get_sf_cli_path, get_sf_jar_path


class SFConfig:
    """Manage Screaming Frog configuration files.

    This class wraps the Java ConfigBuilder CLI to provide a Pythonic interface
    for inspecting, modifying, and using .seospiderconfig files.

    Example:
        >>> config = SFConfig.load("base.seospiderconfig")
        >>> config.max_urls = 100000
        >>> config.add_extraction("Price", "//span[@class='price']")
        >>> config.save("client.seospiderconfig")
        >>> config.run_crawl("https://example.com", output_folder="./results")
    """

    JAR_PATH = Path(__file__).parent / "java" / "ConfigBuilder.jar"

    def __init__(
        self,
        data: Dict[str, Any],
        path: Optional[str] = None,
        sf_path: Optional[str] = None,
    ):
        """Initialize SFConfig with inspection data.

        Args:
            data: Parsed JSON response from the Java CLI --inspect command.
            path: Path to the source config file.
            sf_path: Optional custom path to the SF installation directory.
        """
        self._data = data
        self._path = path
        self._sf_path = sf_path
        self._patches: Dict[str, Any] = {}
        self._extraction_ops: List[Dict[str, Any]] = []
        self._exclude_ops: List[Dict[str, Any]] = []
        self._include_ops: List[Dict[str, Any]] = []
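
    # Note: mutations are buffered client-side (scalar field changes in
    # ``_patches``, list edits in the ``*_ops`` queues) and are only applied
    # when ``save()`` shells out to the Java CLI, so a whole batch of edits
    # costs a single JVM launch.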

    # ==================== Loading ====================

    @classmethod
    def load(cls, path: str, sf_path: Optional[str] = None) -> "SFConfig":
        """Load a config file.

        Args:
            path: Path to the .seospiderconfig file.
            sf_path: Optional custom path to the SF installation directory.
                Auto-detected if not provided.

        Returns:
            SFConfig instance with the loaded configuration.

        Raises:
            SFParseError: If the config file cannot be parsed.
            SFNotFoundError: If Screaming Frog is not installed.
        """
        result = cls._run_java("--inspect", "--config", str(path), sf_path=sf_path)
        return cls(result, str(path), sf_path=sf_path)

    @classmethod
    def default(cls) -> "SFConfig":
        """Create a config from SF's default settings.

        Returns:
            SFConfig instance with the default SF configuration.

        Raises:
            NotImplementedError: If no default config file can be located.
        """
        from .paths import get_default_config_path

        default_path = get_default_config_path()
        if default_path:
            return cls.load(str(default_path))

        raise NotImplementedError(
            "Default config not found. "
            "Please load an existing config file instead."
        )
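
    # The --inspect payload consumed below is assumed to have roughly this
    # shape (key names inferred from the accessors in this class, not from
    # the CLI's documentation):
    #   {"success": true, "sfVersion": "...", "configVersion": "...",
    #    "fields": [{"path": "mCrawlConfig.mMaxUrls", "value": 500000, ...}]}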

    # ==================== Inspection ====================

    def to_dict(self) -> Dict[str, Any]:
        """Return the full config as a dictionary.

        Returns:
            Dictionary containing all config data from the Java CLI.
        """
        return self._data

    def get(self, path: str, default: Any = None) -> Any:
        """Get a specific field value.

        Args:
            path: Dot-separated field path (e.g., "mCrawlConfig.mMaxUrls").
            default: Value to return if the field is not found.

        Returns:
            The field value, or default if not found.
        """
        # Check patches first (pending changes take precedence)
        if path in self._patches:
            return self._patches[path]

        # Search in loaded data
        for field in self._data.get("fields", []):
            if field.get("path") == path:
                return field.get("value")

        return default

    def fields(self, prefix: Optional[str] = None) -> List[Dict[str, Any]]:
        """List all fields, optionally filtered by prefix.

        Args:
            prefix: Optional path prefix to filter fields.

        Returns:
            List of field dictionaries containing path, type, value, etc.
        """
        fields = self._data.get("fields", [])
        if prefix:
            fields = [f for f in fields if f.get("path", "").startswith(prefix)]
        return fields

    @property
    def sf_version(self) -> str:
        """Get the Screaming Frog version that created this config."""
        return self._data.get("sfVersion", "unknown")

    @property
    def config_version(self) -> str:
        """Get the config file version."""
        return self._data.get("configVersion", "unknown")

    @property
    def path(self) -> Optional[str]:
        """Get the path to the loaded config file."""
        return self._path

    # ==================== Modification ====================

    def set(self, path: str, value: Any) -> "SFConfig":
        """Set a field value.

        Args:
            path: Dot-separated field path (e.g., "mCrawlConfig.mMaxUrls").
            value: The value to set.

        Returns:
            Self for method chaining.
        """
        self._patches[path] = value
        return self
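
    # Because ``set()`` returns self, direct calls can be chained:
    #   config.set("mCrawlConfig.mMaxUrls", 1000).set("mCrawlConfig.mMaxDepth", 5)
    # The properties below are sugar over the same patch mechanism.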
    # Convenience properties
    @property
    def max_urls(self) -> int:
        """Get the maximum number of URLs to crawl."""
        return self.get("mCrawlConfig.mMaxUrls", 0)

    @max_urls.setter
    def max_urls(self, value: int):
        """Set the maximum number of URLs to crawl."""
        self.set("mCrawlConfig.mMaxUrls", value)

    @property
    def rendering_mode(self) -> str:
        """Get the rendering mode (STATIC or JAVASCRIPT)."""
        return self.get("mCrawlConfig.mRenderingMode", "STATIC")

    @rendering_mode.setter
    def rendering_mode(self, value: str):
        """Set the rendering mode (STATIC or JAVASCRIPT)."""
        self.set("mCrawlConfig.mRenderingMode", value)

    @property
    def robots_mode(self) -> str:
        """Get the robots.txt handling mode (RESPECT or IGNORE)."""
        return self.get("mCrawlConfig.mRobotsTxtMode", "RESPECT")

    @robots_mode.setter
    def robots_mode(self, value: str):
        """Set the robots.txt handling mode (RESPECT or IGNORE)."""
        self.set("mCrawlConfig.mRobotsTxtMode", value)

    @property
    def max_depth(self) -> int:
        """Get the maximum crawl depth."""
        return self.get("mCrawlConfig.mMaxDepth", 0)

    @max_depth.setter
    def max_depth(self, value: int):
        """Set the maximum crawl depth."""
        self.set("mCrawlConfig.mMaxDepth", value)

    @property
    def crawl_delay(self) -> float:
        """Get the crawl delay in seconds."""
        return self.get("mCrawlConfig.mCrawlDelay", 0.0)

    @crawl_delay.setter
    def crawl_delay(self, value: float):
        """Set the crawl delay in seconds."""
        self.set("mCrawlConfig.mCrawlDelay", value)

    @property
    def user_agent(self) -> str:
        """Get the user agent string."""
        return self.get("mUserAgentConfig.mUserAgent", "")

    @user_agent.setter
    def user_agent(self, value: str):
        """Set the user agent string."""
        self.set("mUserAgentConfig.mUserAgent", value)

    # ==================== Extractions ====================

    def add_extraction(
        self,
        name: str,
        selector: str,
        selector_type: str = "XPATH",
        extract_mode: str = "TEXT",
        attribute: Optional[str] = None,
    ) -> "SFConfig":
        """Add a custom extraction rule.

        Args:
            name: Name for the extraction (appears as a column header in exports).
            selector: The selector pattern (XPath, CSS, or Regex).
            selector_type: Type of selector - "XPATH", "CSS", or "REGEX".
            extract_mode: What to extract - "TEXT", "HTML_ELEMENT", "INNER_HTML",
                or "FUNCTION_VALUE".
            attribute: Optional attribute to extract (for ATTRIBUTE mode).

        Returns:
            Self for method chaining.

        Example:
            >>> config.add_extraction("Price", "//span[@class='price']")
            >>> config.add_extraction("SKU", ".sku-code", selector_type="CSS")
        """
        op = {
            "op": "add",
            "name": name,
            "selector": selector,
            "selectorType": selector_type.upper(),
            "extractMode": extract_mode.upper(),
        }
        if attribute:
            op["attribute"] = attribute
        self._extraction_ops.append(op)
        return self
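
    # Each queued extraction op is a plain dict, e.g.:
    #   {"op": "add", "name": "Price", "selector": "//span[@class='price']",
    #    "selectorType": "XPATH", "extractMode": "TEXT"}
    # save() ships the accumulated list to the CLI under the "extractions" key.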
    def remove_extraction(self, name: str) -> "SFConfig":
        """Remove an extraction rule by name.

        Args:
            name: Name of the extraction rule to remove.

        Returns:
            Self for method chaining.
        """
        self._extraction_ops.append({"op": "remove", "name": name})
        return self

    def clear_extractions(self) -> "SFConfig":
        """Remove all extraction rules.

        Returns:
            Self for method chaining.
        """
        self._extraction_ops.append({"op": "clear"})
        return self

    @property
    def extractions(self) -> List[Dict[str, Any]]:
        """List the current extraction rules.

        Returns:
            List of extraction rule dictionaries.
        """
        # Try to get from the virtual field
        extractions = self.get("mCustomExtractionConfig.extractions", [])
        if extractions:
            return extractions

        # Falling back to parsing mFilters is not implemented yet
        return []

    # ==================== Excludes ====================

    def add_exclude(self, pattern: str) -> "SFConfig":
        """Add an exclude pattern (regex).

        URLs matching this pattern will be excluded from crawling.

        Args:
            pattern: Regex pattern to exclude.

        Returns:
            Self for method chaining.

        Example:
            >>> config.add_exclude(r".*\\.pdf$")    # Exclude PDFs
            >>> config.add_exclude(r".*/admin/.*")  # Exclude admin paths
        """
        self._exclude_ops.append({"op": "append", "values": [pattern]})
        return self

    def remove_exclude(self, pattern: str) -> "SFConfig":
        """Remove an exclude pattern.

        Args:
            pattern: The exact pattern to remove.

        Returns:
            Self for method chaining.
        """
        self._exclude_ops.append({"op": "remove", "values": [pattern]})
        return self

    def clear_excludes(self) -> "SFConfig":
        """Remove all exclude patterns.

        Returns:
            Self for method chaining.
        """
        self._exclude_ops.append({"op": "clear"})
        return self

    @property
    def excludes(self) -> List[str]:
        """List the current exclude patterns.

        Returns:
            List of regex patterns.
        """
        return self.get("mExcludeManager.mExcludePatterns", [])

    # ==================== Includes ====================

    def add_include(self, pattern: str) -> "SFConfig":
        """Add an include pattern (regex).

        Only URLs matching include patterns will be crawled.

        Args:
            pattern: Regex pattern to include.

        Returns:
            Self for method chaining.
        """
        self._include_ops.append({"op": "append", "values": [pattern]})
        return self

    def remove_include(self, pattern: str) -> "SFConfig":
        """Remove an include pattern.

        Args:
            pattern: The exact pattern to remove.

        Returns:
            Self for method chaining.
        """
        self._include_ops.append({"op": "remove", "values": [pattern]})
        return self

    def clear_includes(self) -> "SFConfig":
        """Remove all include patterns.

        Returns:
            Self for method chaining.
        """
        self._include_ops.append({"op": "clear"})
        return self

    @property
    def includes(self) -> List[str]:
        """List the current include patterns.

        Returns:
            List of regex patterns.
        """
        return self.get("mCrawlConfig.mIncludePatterns", [])

    # ==================== Allowed Domains ====================

    def add_allowed_domain(self, domain: str) -> "SFConfig":
        """Add an allowed domain for crawling.

        Args:
            domain: Domain to allow (e.g., "example.com").

        Returns:
            Self for method chaining.
        """
        # This typically maps to a specific SF config field;
        # the implementation depends on the SF version.
        # Because get() reads pending patches first, repeated calls
        # accumulate domains rather than overwriting each other.
        self.set("mCrawlConfig.mAllowedDomains",
                 self.get("mCrawlConfig.mAllowedDomains", []) + [domain])
        return self

    # ==================== Saving ====================

    def save(self, output_path: Optional[str] = None) -> "SFConfig":
        """Save the config to a file.

        Args:
            output_path: Path to save to. If None, overwrites the original file.

        Returns:
            Self for method chaining.

        Raises:
            SFConfigError: If no output path is specified and no original path exists.
            SFValidationError: If patches contain invalid fields or values.
        """
        output = output_path or self._path
        if not output:
            raise SFConfigError("No output path specified and no original path to overwrite")
        if not self._path:
            # --build needs a template file; a config constructed without a
            # source path cannot be rebuilt
            raise SFConfigError("No template config to build from")

        # Build the patches dict
        patches = dict(self._patches)

        if self._extraction_ops:
            patches["extractions"] = self._extraction_ops

        if self._exclude_ops:
            # Convert ops to the single-op-per-field format expected by the Java CLI
            if len(self._exclude_ops) == 1 and self._exclude_ops[0].get("op") == "clear":
                patches["mExcludeManager.mExcludePatterns"] = {"op": "clear"}
            else:
                # Combine all ops; since only one op survives, a later remove or
                # clear supersedes any appends queued before it (appends queued
                # after a remove/clear are dropped rather than misapplied)
                combined_op: Dict[str, Any] = {"op": "append", "values": []}
                for op in self._exclude_ops:
                    if op.get("op") == "append" and combined_op["op"] == "append":
                        combined_op["values"].extend(op.get("values", []))
                    elif op.get("op") == "remove":
                        combined_op = {"op": "remove", "values": list(op.get("values", []))}
                    elif op.get("op") == "clear":
                        combined_op = {"op": "clear"}
                patches["mExcludeManager.mExcludePatterns"] = combined_op

        if self._include_ops:
            if len(self._include_ops) == 1 and self._include_ops[0].get("op") == "clear":
                patches["mCrawlConfig.mIncludePatterns"] = {"op": "clear"}
            else:
                combined_op = {"op": "append", "values": []}
                for op in self._include_ops:
                    if op.get("op") == "append" and combined_op["op"] == "append":
                        combined_op["values"].extend(op.get("values", []))
                    elif op.get("op") == "remove":
                        combined_op = {"op": "remove", "values": list(op.get("values", []))}
                    elif op.get("op") == "clear":
                        combined_op = {"op": "clear"}
                patches["mCrawlConfig.mIncludePatterns"] = combined_op

        patches_json = json.dumps(patches)

        self._run_java(
            "--build",
            "--template", self._path,
            "--output", str(output),
            "--patches", patches_json,
            sf_path=self._sf_path,
        )

        # Update state
        self._path = str(output)
        self._patches = {}
        self._extraction_ops = []
        self._exclude_ops = []
        self._include_ops = []

        # Reload to get fresh data
        result = self._run_java("--inspect", "--config", str(output), sf_path=self._sf_path)
        self._data = result

        return self
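
    # The consolidated patches document that save() sends via --patches looks
    # roughly like this (illustrative values):
    #   {"mCrawlConfig.mMaxUrls": 100000,
    #    "extractions": [{"op": "add", "name": "Price", ...}],
    #    "mExcludeManager.mExcludePatterns": {"op": "append", "values": [".*\\.pdf$"]}}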
    def preview_save(self) -> List[Dict[str, Any]]:
        """Preview changes without saving.

        Returns:
            List of change dictionaries showing what would be modified.
        """
        patches = dict(self._patches)
        if self._extraction_ops:
            patches["extractions"] = self._extraction_ops
        # Unlike save(), the raw op lists are passed through here without
        # consolidation
        if self._exclude_ops:
            patches["mExcludeManager.mExcludePatterns"] = self._exclude_ops
        if self._include_ops:
            patches["mCrawlConfig.mIncludePatterns"] = self._include_ops

        patches_json = json.dumps(patches)

        # Use NUL on Windows, /dev/null on Unix
        import platform
        null_path = "NUL" if platform.system() == "Windows" else "/dev/null"

        result = self._run_java(
            "--build",
            "--template", self._path,
            "--output", null_path,
            "--patches", patches_json,
            "--dry-run",
            sf_path=self._sf_path,
        )

        return result.get("changes", [])

    # ==================== Crawling ====================

    def run_crawl(
        self,
        url: str,
        output_folder: str,
        export_tabs: Optional[List[str]] = None,
        export_format: str = "csv",
        timeout: Optional[int] = None,
    ) -> None:
        """Run a crawl (blocking).

        Args:
            url: The URL to start crawling from.
            output_folder: Directory to save crawl results.
            export_tabs: List of tabs to export (e.g., ["Internal:All", "Response Codes:All"]).
            export_format: Export format - "csv" or "xlsx".
            timeout: Maximum time in seconds to wait for crawl completion.

        Raises:
            SFCrawlError: If the crawl fails or times out.
            SFConfigError: If the config hasn't been saved yet.
        """
        process = self.run_crawl_async(url, output_folder, export_tabs, export_format)

        try:
            process.wait(timeout=timeout)
        except subprocess.TimeoutExpired:
            process.kill()
            raise SFCrawlError(f"Crawl timed out after {timeout} seconds")

        if process.returncode != 0:
            raise SFCrawlError(f"Crawl failed with exit code {process.returncode}")

    def run_crawl_async(
        self,
        url: str,
        output_folder: str,
        export_tabs: Optional[List[str]] = None,
        export_format: str = "csv",
    ) -> subprocess.Popen:
        """Run a crawl (non-blocking).

        Args:
            url: The URL to start crawling from.
            output_folder: Directory to save crawl results.
            export_tabs: List of tabs to export.
            export_format: Export format - "csv" or "xlsx".

        Returns:
            subprocess.Popen handle for the crawl process.

        Raises:
            SFConfigError: If the config hasn't been saved yet.
        """
        if not self._path:
            raise SFConfigError("Save config before running crawl")

        cli = get_sf_cli_path()

        cmd = [
            cli,
            "--crawl", url,
            "--config", self._path,
            "--headless",
            "--output-folder", str(output_folder),
            "--export-format", export_format,
        ]

        if export_tabs:
            cmd.extend(["--export-tabs", ",".join(export_tabs)])

        return subprocess.Popen(cmd)
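
    # run_crawl_async() returns the raw Popen, so callers can poll instead of
    # blocking, e.g.:
    #   proc = config.run_crawl_async("https://example.com", "./results")
    #   while proc.poll() is None:
    #       ...  # do other work, then check again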

    # ==================== Test Extraction ====================

    def test_extraction(
        self,
        url: str,
        extraction_name: Optional[str] = None,
        selector: Optional[str] = None,
        selector_type: str = "XPATH",
        extract_mode: str = "TEXT",
        render_js: bool = False,
    ) -> Dict[str, Any]:
        """Test an extraction against a live URL.

        Args:
            url: URL to fetch and test against.
            extraction_name: Name of an existing extraction rule to test.
            selector: Inline selector to test (alternative to extraction_name).
            selector_type: Type of selector - "XPATH", "CSS", or "REGEX".
            extract_mode: What to extract - "TEXT", "HTML_ELEMENT", etc.
            render_js: Whether to render JavaScript before extraction.

        Returns:
            Dictionary containing:
                - success: Whether the test succeeded
                - matches: List of matched values
                - match_count: Number of matches
                - warnings: Any warnings

        Raises:
            SFValidationError: If neither extraction_name nor selector is
                provided, or if the named extraction does not exist.
        """
        if extraction_name:
            # Find the extraction in the config; the for/else raises if
            # no rule matched
            for ext in self.extractions:
                if ext.get("name") == extraction_name:
                    selector = ext.get("selector")
                    selector_type = ext.get("selectorType", "XPATH")
                    extract_mode = ext.get("extractMode", "TEXT")
                    break
            else:
                raise SFValidationError(f"Extraction '{extraction_name}' not found")

        if not selector:
            raise SFValidationError("Provide extraction_name or selector")

        args = [
            "--test-extraction",
            "--url", url,
            "--selector", selector,
            "--selector-type", selector_type.upper(),
            "--extract-mode", extract_mode.upper(),
        ]

        if render_js:
            args.append("--render-js")

        result = self._run_java(*args, sf_path=self._sf_path)
        return result

    # ==================== Diff ====================

    @classmethod
    def diff(
        cls,
        config_a: Union[str, "SFConfig"],
        config_b: Union[str, "SFConfig"],
        prefix: Optional[str] = None,
        sf_path: Optional[str] = None,
    ) -> SFDiff:
        """Compare two configs.

        Args:
            config_a: First config (path or SFConfig instance).
            config_b: Second config (path or SFConfig instance).
            prefix: Optional path prefix to filter differences.
            sf_path: Optional custom path to the SF installation directory.

        Returns:
            SFDiff object representing the differences.

        Example:
            >>> diff = SFConfig.diff("old.seospiderconfig", "new.seospiderconfig")
            >>> if diff.has_changes:
            ...     print(diff)
        """
        path_a = config_a._path if isinstance(config_a, SFConfig) else str(config_a)
        path_b = config_b._path if isinstance(config_b, SFConfig) else str(config_b)

        # Get sf_path from the first config if not provided
        if sf_path is None and isinstance(config_a, SFConfig):
            sf_path = config_a._sf_path

        args = ["--diff", "--config-a", path_a, "--config-b", path_b]
        if prefix:
            args.extend(["--prefix", prefix])

        result = cls._run_java(*args, sf_path=sf_path)
        return SFDiff(result)

    # ==================== Internal ====================

    @classmethod
    def _run_java(cls, *args: str, sf_path: Optional[str] = None) -> Dict[str, Any]:
        """Execute the Java CLI and return the parsed JSON result.

        Args:
            *args: Command-line arguments to pass to the Java CLI.
            sf_path: Optional custom path to the SF installation directory.

        Returns:
            Parsed JSON response from the CLI.

        Raises:
            SFParseError: If the CLI output is not valid JSON.
            SFValidationError: If the CLI returns a validation error.
            SFConfigError: If the CLI returns any other error.
        """
        java = get_java_path(sf_path)
        sf_jar_path = get_sf_jar_path(sf_path)
        cp_sep = get_classpath_separator()

        # Build the classpath: the bundled ConfigBuilder.jar plus every jar in
        # the SF installation directory (the trailing /* is a JVM wildcard)
        classpath = f"{cls.JAR_PATH}{cp_sep}{sf_jar_path}/*"

        cmd = [java, "-cp", classpath, "ConfigBuilder", *args]

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
        )

        # Handle empty output
        if not result.stdout.strip():
            if result.stderr:
                raise SFConfigError(f"Java CLI error: {result.stderr}")
            raise SFParseError("No output from Java CLI")

        try:
            data = json.loads(result.stdout)
        except json.JSONDecodeError as e:
            raise SFParseError(
                f"Invalid JSON from CLI: {result.stdout[:200]}...\n"
                f"Parse error: {e}"
            ) from e

        # Errors are reported in-band via a success flag rather than the exit code
        if not data.get("success", True):
            error_type = data.get("errorType", "UNKNOWN")
            error_msg = data.get("error", "Unknown error")
            details = data.get("details", {})

            if error_type == "VALIDATION_ERROR":
                raise SFValidationError(f"{error_msg}: {details}" if details else error_msg)
            elif error_type == "PARSE_ERROR":
                raise SFParseError(error_msg)
            elif error_type == "IO_ERROR":
                raise SFConfigError(f"I/O error: {error_msg}")
            else:
                raise SFConfigError(error_msg)

        return data

    def __repr__(self) -> str:
        """Return a developer-friendly representation."""
        return f"<SFConfig path={self._path!r} version={self.config_version}>"

    def __str__(self) -> str:
        """Return a human-readable string representation."""
        return f"SFConfig({self._path or 'unsaved'})"
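
Taken together, config.py gives a load, modify, save, crawl pipeline in which every queued change is applied in a single CLI invocation. A minimal end-to-end sketch of the public API above (the URL and file paths are illustrative, and the import assumes sfconfig/__init__.py re-exports SFConfig):

    from sfconfig import SFConfig

    config = SFConfig.load("base.seospiderconfig")
    config.max_urls = 100_000
    config.rendering_mode = "JAVASCRIPT"
    config.add_extraction("Price", "//span[@class='price']")
    config.add_exclude(r".*\.pdf$")
    config.save("client.seospiderconfig")  # one CLI call applies all buffered edits
    config.run_crawl(
        "https://example.com",
        output_folder="./results",
        export_tabs=["Internal:All"],
        timeout=3600,
    )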