sf_config_builder-0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sf_config_builder-0.1.1.dist-info/METADATA +316 -0
- sf_config_builder-0.1.1.dist-info/RECORD +10 -0
- sf_config_builder-0.1.1.dist-info/WHEEL +5 -0
- sf_config_builder-0.1.1.dist-info/licenses/LICENSE +21 -0
- sf_config_builder-0.1.1.dist-info/top_level.txt +1 -0
- sfconfig/__init__.py +34 -0
- sfconfig/config.py +767 -0
- sfconfig/diff.py +145 -0
- sfconfig/exceptions.py +26 -0
- sfconfig/paths.py +217 -0
sf_config_builder-0.1.1.dist-info/METADATA
ADDED

@@ -0,0 +1,316 @@
Metadata-Version: 2.4
Name: sf-config-builder
Version: 0.1.1
Summary: Manage Screaming Frog configs programmatically
Author: Antonio
License: MIT
Project-URL: Homepage, https://github.com/Amaculus/sf-config-builder
Project-URL: Documentation, https://github.com/Amaculus/sf-config-builder#readme
Project-URL: Repository, https://github.com/Amaculus/sf-config-builder
Project-URL: Issues, https://github.com/Amaculus/sf-config-builder/issues
Keywords: seo,screaming-frog,crawling,automation,config
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Provides-Extra: dev
Requires-Dist: pytest>=7.0; extra == "dev"
Requires-Dist: pytest-cov>=4.0; extra == "dev"
Dynamic: license-file

# sf-config-tool

Manage Screaming Frog `.seospiderconfig` files programmatically.

## Installation

```bash
pip install sf-config-builder
```

### Requirements

- **Screaming Frog SEO Spider** must be installed (provides JARs for deserialization)
- Python 3.8+

## Quick Start

```python
from sfconfig import SFConfig

# Load existing config
config = SFConfig.load("base.seospiderconfig")

# Configure for e-commerce audit
config.max_urls = 100000
config.rendering_mode = "JAVASCRIPT"

# Add custom extractions
config.add_extraction("Price", "//span[@class='price']")
config.add_extraction("SKU", "//span[@itemprop='sku']")
config.add_extraction("Stock", ".availability", selector_type="CSS")

# Add exclude patterns
config.add_exclude(r".*\.pdf$")
config.add_exclude(r".*/admin/.*")

# Save and run
config.save("client-audit.seospiderconfig")
config.run_crawl("https://example.com", output_folder="./results")
```

## Features

### Inspect Configs

```python
config = SFConfig.load("my.seospiderconfig")

# Get specific field
max_urls = config.get("mCrawlConfig.mMaxUrls")

# List all fields
for field in config.fields():
    print(f"{field['path']}: {field['value']}")

# Filter by prefix
crawl_fields = config.fields(prefix="mCrawlConfig")
```

### Modify Configs

```python
# Direct field access
config.set("mCrawlConfig.mMaxUrls", 100000)

# Convenience properties
config.max_urls = 100000
config.max_depth = 10
config.rendering_mode = "JAVASCRIPT"  # STATIC | JAVASCRIPT
config.robots_mode = "IGNORE"         # RESPECT | IGNORE
config.crawl_delay = 0.5
config.user_agent = "MyBot/1.0"
```

### Custom Extractions

```python
# Add extraction rules
config.add_extraction(
    name="Price",
    selector="//span[@class='price']",
    selector_type="XPATH",  # XPATH | CSS | REGEX
    extract_mode="TEXT"     # TEXT | HTML_ELEMENT | INNER_HTML
)

# List extractions
for ext in config.extractions:
    print(f"{ext['name']}: {ext['selector']}")

# Remove by name
config.remove_extraction("Price")

# Clear all
config.clear_extractions()
```

### Exclude/Include Patterns

```python
# Excludes (URLs matching these patterns are skipped)
config.add_exclude(r".*\.pdf$")
config.add_exclude(r".*/admin/.*")

# Includes (only URLs matching these are crawled)
config.add_include(r".*/products/.*")

# List patterns
print(config.excludes)
print(config.includes)
```

### Compare Configs

```python
from sfconfig import SFConfig

diff = SFConfig.diff("old.seospiderconfig", "new.seospiderconfig")

if diff.has_changes:
    print(f"Found {diff.change_count} differences:")
    print(diff)

# Filter by prefix
crawl_changes = diff.changes_for("mCrawlConfig")
```

### Test Extractions

```python
# Test selector against live URL before full crawl
result = config.test_extraction(
    url="https://example.com/product",
    selector="//span[@class='price']",
    selector_type="XPATH"
)

if result["match_count"] > 0:
    print(f"Found: {result['matches']}")
else:
    print("Selector didn't match - fix before crawling")
```

### Run Crawls

```python
# Blocking crawl
config.run_crawl(
    url="https://example.com",
    output_folder="./results",
    export_tabs=["Internal:All", "Response Codes:All"],
    export_format="csv",
    timeout=3600
)

# Async crawl
process = config.run_crawl_async(
    url="https://example.com",
    output_folder="./results"
)
# Do other work...
process.wait()  # Block until complete
```

## Multi-Client Workflow

```python
from sfconfig import SFConfig

clients = [
    {"domain": "client1.com", "max_urls": 50000},
    {"domain": "client2.com", "max_urls": 100000},
]

for client in clients:
    config = SFConfig.load("agency-base.seospiderconfig")
    config.max_urls = client["max_urls"]
    config.add_extraction("Price", "//span[@class='price']")

    config.save(f"/tmp/{client['domain']}.seospiderconfig")
    config.run_crawl(
        url=f"https://{client['domain']}",
        output_folder=f"./results/{client['domain']}"
    )
```

## Error Handling

```python
from sfconfig import (
    SFConfig,
    SFNotFoundError,
    SFValidationError,
    SFParseError,
    SFCrawlError
)

try:
    config = SFConfig.load("my.seospiderconfig")
    config.set("mInvalidField", 123)
    config.save()
except SFNotFoundError:
    print("Install Screaming Frog first")
except SFValidationError as e:
    print(f"Invalid field: {e}")
except SFParseError as e:
    print(f"Could not parse config: {e}")
except SFCrawlError as e:
    print(f"Crawl failed: {e}")
```

## Environment Variables

| Variable | Description |
|----------|-------------|
| `SF_PATH` | Custom path to SF's JAR directory |
| `SF_CLI_PATH` | Custom path to SF CLI executable |
| `JAVA_HOME` | Custom Java installation path |

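For example, to point the library at a non-standard install, set these before first use. A minimal sketch, assuming `sfconfig` reads the variables when it first resolves paths; the install locations shown are placeholders:

```python
import os

# Placeholder paths - point these at your actual installs.
# Assumption: sfconfig consults these variables lazily, so setting
# them before the first load is sufficient.
os.environ["SF_PATH"] = "/opt/screamingfrogseospider"
os.environ["SF_CLI_PATH"] = "/opt/screamingfrogseospider/ScreamingFrogSEOSpiderCli"
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk"

from sfconfig import SFConfig

config = SFConfig.load("base.seospiderconfig")
```
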
## Architecture

```
User Python code
         |
         v
+------------------+
|     sfconfig     |  (Python wrapper)
|  - SFConfig      |
|  - SFDiff        |
+--------+---------+
         | subprocess.run()
         v
+------------------+
|  ConfigBuilder   |  (Java CLI, bundled ~50KB)
|      .jar        |
+--------+---------+
         | classpath includes
         v
+------------------+
|    SF's JARs     |  (from user's local SF install, NOT bundled)
+------------------+
```

At runtime, the library builds a classpath combining:
- `ConfigBuilder.jar` (bundled with this package)
- `{SF_INSTALL_PATH}/*` (user's local Screaming Frog JARs)

This means:
- Only our small JAR is distributed (no licensing issues)
- SF's proprietary JARs are used from the user's existing installation
- Compatibility is maintained across SF versions

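A rough sketch of that classpath-plus-subprocess pattern in Python. Illustrative only: the `ConfigBuilder` entry-point class matches the `jar cfe` step in the Development section below, but the `read` subcommand and the paths are assumptions, not the library's actual internals:

```python
import os
import subprocess

def build_classpath(bundled_jar: str, sf_install_path: str) -> str:
    """Join the bundled CLI JAR with the user's SF install directory.

    The trailing wildcard lets the JVM pick up every JAR in the
    Screaming Frog install without naming each one.
    """
    return os.pathsep.join([bundled_jar, os.path.join(sf_install_path, "*")])

# Hypothetical invocation: ask the Java CLI to dump a config.
classpath = build_classpath(
    "sfconfig/java/ConfigBuilder.jar",  # bundled with the package
    "/opt/screamingfrogseospider",      # user's SF install (placeholder)
)
result = subprocess.run(
    ["java", "-cp", classpath, "ConfigBuilder", "read", "base.seospiderconfig"],
    capture_output=True,
    text=True,
    check=True,  # raise if the JVM exits non-zero
)
print(result.stdout)
```
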
## Development

### Building the Java CLI

The Java CLI lives in a separate repo (`sf-config-builder`). To build:

```bash
cd /path/to/sf-config-builder

# Compile against SF's JARs (as compile-time dependency)
javac -cp "C:/Program Files/Screaming Frog SEO Spider/*" \
    -d bin src/ConfigBuilder.java

# Package into JAR
cd bin
jar cfe ConfigBuilder.jar ConfigBuilder *.class

# Copy to Python package
cp ConfigBuilder.jar /path/to/sf-config-tool/sfconfig/java/
```

**Important**: Only bundle `ConfigBuilder.jar`. Do NOT bundle any JARs from SF's install directory - those are proprietary and already on the user's machine.

### Installing for Development

```bash
cd sf-config-tool
pip install -e ".[dev]"
pytest tests/
```

## License

MIT
sf_config_builder-0.1.1.dist-info/RECORD
ADDED

@@ -0,0 +1,10 @@
sf_config_builder-0.1.1.dist-info/licenses/LICENSE,sha256=gY8ByiA5FLdBA_UPHEij4N2I3ACFJK_axSAGB8lvhxA,1064
sfconfig/__init__.py,sha256=XsFEy2jxz61Ub5pUVfjoouIlJd5sfY8QrtofAbJrSTQ,902
sfconfig/config.py,sha256=zTDPBN8yRxb4aboWIv-lTKZPajquMff9J_DOymZH_fQ,25169
sfconfig/diff.py,sha256=p3gh-wcalAeR8UwZmpbGgkZVRMkwBKpdMMhIQIQUqsg,4597
sfconfig/exceptions.py,sha256=AsMRv9WZgpoxo3BZCVmk6WyZ4dULL7ojL79BHjfp0zY,492
sfconfig/paths.py,sha256=kOdnoA3IYIAUTnurcGlDoyfhns4a8J73PE5pYDnLBmE,6502
sf_config_builder-0.1.1.dist-info/METADATA,sha256=fkYwSe_ZJPbf4-lEEfQQJKNkA7eDoi_2U5iggAivLP0,7965
sf_config_builder-0.1.1.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
sf_config_builder-0.1.1.dist-info/top_level.txt,sha256=-vINvIw8-ocnRNzJE8L7cn0qTGjRooBvTPHF4I1An2M,9
sf_config_builder-0.1.1.dist-info/RECORD,,
sf_config_builder-0.1.1.dist-info/licenses/LICENSE
ADDED

@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026 Antonio

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
sf_config_builder-0.1.1.dist-info/top_level.txt
ADDED

@@ -0,0 +1 @@
sfconfig
sfconfig/__init__.py
ADDED
@@ -0,0 +1,34 @@
"""sf-config-tool: Manage Screaming Frog configs programmatically.

This library provides a Python interface for managing Screaming Frog
.seospiderconfig files, enabling inspection, diffing, modification,
and crawl execution without using the SF GUI.

Example:
    >>> from sfconfig import SFConfig
    >>> config = SFConfig.load("base.seospiderconfig")
    >>> config.add_extraction("Price", "//span[@class='price']")
    >>> config.save("client.seospiderconfig")
    >>> config.run_crawl("https://example.com", output_folder="./results")
"""

from .config import SFConfig
from .diff import SFDiff
from .exceptions import (
    SFConfigError,
    SFCrawlError,
    SFNotFoundError,
    SFParseError,
    SFValidationError,
)

__version__ = "0.1.1"
__all__ = [
    "SFConfig",
    "SFDiff",
    "SFConfigError",
    "SFNotFoundError",
    "SFValidationError",
    "SFParseError",
    "SFCrawlError",
]
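The diff does not include the body of `sfconfig/exceptions.py`, but the imports above pin down its public names. A plausible reconstruction, assuming each error subclasses a shared `SFConfigError` base; the docstrings are inferred from how the README's error-handling example uses each class:

```python
# Sketch of sfconfig/exceptions.py - inferred, not the published source.

class SFConfigError(Exception):
    """Base class for all sfconfig errors."""


class SFNotFoundError(SFConfigError):
    """Screaming Frog (or one of its JARs) could not be located."""


class SFValidationError(SFConfigError):
    """A field path or value failed validation."""


class SFParseError(SFConfigError):
    """A .seospiderconfig file could not be deserialized."""


class SFCrawlError(SFConfigError):
    """A crawl run via the SF CLI failed."""
```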