ssc_codegen 0.17.1__tar.gz → 0.18.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/.gitignore +147 -147
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/LICENSE +20 -20
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/PKG-INFO +2 -2
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/README.md +124 -124
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/pyproject.toml +112 -112
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/__init__.py +27 -27
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/_logging.py +102 -102
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/ast/__init__.py +175 -174
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/ast/array.py +56 -56
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/ast/base.py +24 -24
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/ast/cast.py +72 -72
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/ast/control.py +82 -82
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/ast/extract.py +51 -51
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/ast/helpers.py +7 -7
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/ast/jsondef.py +37 -37
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/ast/module.py +92 -92
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/ast/predicate_containers.py +44 -44
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/ast/predicate_ops.py +309 -309
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/ast/regex.py +57 -57
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/ast/selectors.py +55 -55
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/ast/string.py +158 -158
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/ast/struct.py +283 -232
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/ast/transform.py +63 -63
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/ast/typedef.py +41 -41
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/ast/types.py +79 -79
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/converters/base.py +331 -327
- ssc_codegen-0.18.0/ssc_codegen/converters/go_goquery.py +1872 -0
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/converters/helpers.py +76 -76
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/converters/js_pure.py +1610 -1445
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/converters/py_bs4.py +1509 -1384
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/converters/py_lxml.py +548 -538
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/converters/py_parsel.py +478 -475
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/converters/py_slax.py +433 -428
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/document_utils.py +107 -107
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/exceptions.py +11 -11
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/health.py +426 -418
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/kdl/__init__.py +35 -35
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/kdl/parser.py +1366 -954
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/linter/__init__.py +26 -26
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/linter/_kdl_lang.py +435 -415
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/linter/base.py +822 -818
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/linter/errors.py +78 -78
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/linter/format_errors.py +285 -285
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/linter/metadata.py +18 -18
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/linter/navigation.py +262 -262
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/linter/path.py +36 -36
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/linter/rule_keywords.py +321 -320
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/linter/rules.py +848 -848
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/linter/rules_struct.py +611 -571
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/linter/type_rules.py +523 -523
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/linter/types.py +188 -188
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/main.py +668 -616
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/parser.py +2357 -2298
- ssc_codegen-0.18.0/ssc_codegen/parsers/__init__.py +23 -0
- ssc_codegen-0.18.0/ssc_codegen/parsers/curl.py +240 -0
- ssc_codegen-0.18.0/ssc_codegen/parsers/http.py +218 -0
- ssc_codegen-0.18.0/ssc_codegen/parsers/spec.py +327 -0
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/pseudo_selectors.py +39 -39
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/regex_utils.py +204 -204
- {ssc_codegen-0.17.1 → ssc_codegen-0.18.0}/ssc_codegen/selector_utils.py +8 -8
|
@@ -1,147 +1,147 @@
|
|
|
1
|
-
# Byte-compiled / optimized / DLL files
|
|
2
|
-
__pycache__/
|
|
3
|
-
*.py[cod]
|
|
4
|
-
*$py.class
|
|
5
|
-
|
|
6
|
-
# C extensions / native shared libs
|
|
7
|
-
*.so
|
|
8
|
-
*.dll
|
|
9
|
-
*.dylib
|
|
10
|
-
*.exp
|
|
11
|
-
*.lib
|
|
12
|
-
|
|
13
|
-
# Distribution / packaging
|
|
14
|
-
.Python
|
|
15
|
-
build/
|
|
16
|
-
develop-eggs/
|
|
17
|
-
dist/
|
|
18
|
-
downloads/
|
|
19
|
-
eggs/
|
|
20
|
-
.eggs/
|
|
21
|
-
lib/
|
|
22
|
-
lib64/
|
|
23
|
-
parts/
|
|
24
|
-
sdist/
|
|
25
|
-
var/
|
|
26
|
-
wheels/
|
|
27
|
-
pip-wheel-metadata/
|
|
28
|
-
share/python-wheels/
|
|
29
|
-
.ruff_cache/
|
|
30
|
-
*.egg-info/
|
|
31
|
-
.installed.cfg
|
|
32
|
-
*.egg
|
|
33
|
-
MANIFEST
|
|
34
|
-
|
|
35
|
-
# PyInstaller
|
|
36
|
-
# Usually these files are written by a python script from a template
|
|
37
|
-
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
38
|
-
*.manifest
|
|
39
|
-
*.spec
|
|
40
|
-
|
|
41
|
-
# Installer logs
|
|
42
|
-
pip-log.txt
|
|
43
|
-
pip-delete-this-directory.txt
|
|
44
|
-
|
|
45
|
-
# Unit tests / coverage reports
|
|
46
|
-
htmlcov/
|
|
47
|
-
.tox/
|
|
48
|
-
.nox/
|
|
49
|
-
.coverage
|
|
50
|
-
.coverage.*
|
|
51
|
-
.cache
|
|
52
|
-
nosetests.xml
|
|
53
|
-
coverage.xml
|
|
54
|
-
*.cover
|
|
55
|
-
*.py,cover
|
|
56
|
-
.hypothesis/
|
|
57
|
-
.pytest_cache/
|
|
58
|
-
|
|
59
|
-
# Translations
|
|
60
|
-
*.mo
|
|
61
|
-
*.pot
|
|
62
|
-
|
|
63
|
-
# Django stuff:
|
|
64
|
-
*.log
|
|
65
|
-
local_settings.py
|
|
66
|
-
db.sqlite3
|
|
67
|
-
db.sqlite3-journal
|
|
68
|
-
|
|
69
|
-
# Flask stuff:
|
|
70
|
-
instance/
|
|
71
|
-
.webassets-cache
|
|
72
|
-
|
|
73
|
-
# Scrapy stuff:
|
|
74
|
-
.scrapy
|
|
75
|
-
|
|
76
|
-
# Sphinx documentation
|
|
77
|
-
docs/source/_build/
|
|
78
|
-
|
|
79
|
-
# PyBuilder
|
|
80
|
-
target/
|
|
81
|
-
|
|
82
|
-
# Jupyter Notebook
|
|
83
|
-
.ipynb_checkpoints
|
|
84
|
-
|
|
85
|
-
# IPython
|
|
86
|
-
profile_default/
|
|
87
|
-
ipython_config.py
|
|
88
|
-
|
|
89
|
-
# pyenv
|
|
90
|
-
.python-version
|
|
91
|
-
|
|
92
|
-
# pipenv
|
|
93
|
-
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
94
|
-
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
95
|
-
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
96
|
-
# install all needed dependencies.
|
|
97
|
-
#Pipfile.lock
|
|
98
|
-
|
|
99
|
-
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
|
100
|
-
__pypackages__/
|
|
101
|
-
|
|
102
|
-
# Celery stuff
|
|
103
|
-
celerybeat-schedule
|
|
104
|
-
celerybeat.pid
|
|
105
|
-
|
|
106
|
-
# SageMath parsed files
|
|
107
|
-
*.sage.py
|
|
108
|
-
|
|
109
|
-
# Environments
|
|
110
|
-
.env
|
|
111
|
-
.venv
|
|
112
|
-
env/
|
|
113
|
-
venv/
|
|
114
|
-
ENV/
|
|
115
|
-
env.bak/
|
|
116
|
-
venv.bak/
|
|
117
|
-
|
|
118
|
-
# Spyder project settings
|
|
119
|
-
.spyderproject
|
|
120
|
-
.spyproject
|
|
121
|
-
|
|
122
|
-
# Rope project settings
|
|
123
|
-
.ropeproject
|
|
124
|
-
|
|
125
|
-
# mkdocs documentation
|
|
126
|
-
/site
|
|
127
|
-
|
|
128
|
-
# mypy
|
|
129
|
-
.mypy_cache/
|
|
130
|
-
.dmypy.json
|
|
131
|
-
dmypy.json
|
|
132
|
-
|
|
133
|
-
# Pyre type checker
|
|
134
|
-
.pyre/
|
|
135
|
-
|
|
136
|
-
# project
|
|
137
|
-
test_schemas/
|
|
138
|
-
.idea/.gitignore
|
|
139
|
-
.idea/.name
|
|
140
|
-
.idea/misc.xml
|
|
141
|
-
.idea/modules.xml
|
|
142
|
-
.idea/selector_schema_codegen.iml
|
|
143
|
-
.idea/vcs.xml
|
|
144
|
-
.idea/yh_parser.iml
|
|
145
|
-
.idea/inspectionProfiles/profiles_settings.xml
|
|
146
|
-
.idea/inspectionProfiles/Project_Default.xml
|
|
147
|
-
.idea/libraries/Dart_SDK.xml
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions / native shared libs
|
|
7
|
+
*.so
|
|
8
|
+
*.dll
|
|
9
|
+
*.dylib
|
|
10
|
+
*.exp
|
|
11
|
+
*.lib
|
|
12
|
+
|
|
13
|
+
# Distribution / packaging
|
|
14
|
+
.Python
|
|
15
|
+
build/
|
|
16
|
+
develop-eggs/
|
|
17
|
+
dist/
|
|
18
|
+
downloads/
|
|
19
|
+
eggs/
|
|
20
|
+
.eggs/
|
|
21
|
+
lib/
|
|
22
|
+
lib64/
|
|
23
|
+
parts/
|
|
24
|
+
sdist/
|
|
25
|
+
var/
|
|
26
|
+
wheels/
|
|
27
|
+
pip-wheel-metadata/
|
|
28
|
+
share/python-wheels/
|
|
29
|
+
.ruff_cache/
|
|
30
|
+
*.egg-info/
|
|
31
|
+
.installed.cfg
|
|
32
|
+
*.egg
|
|
33
|
+
MANIFEST
|
|
34
|
+
|
|
35
|
+
# PyInstaller
|
|
36
|
+
# Usually these files are written by a python script from a template
|
|
37
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
38
|
+
*.manifest
|
|
39
|
+
*.spec
|
|
40
|
+
|
|
41
|
+
# Installer logs
|
|
42
|
+
pip-log.txt
|
|
43
|
+
pip-delete-this-directory.txt
|
|
44
|
+
|
|
45
|
+
# Unit tests / coverage reports
|
|
46
|
+
htmlcov/
|
|
47
|
+
.tox/
|
|
48
|
+
.nox/
|
|
49
|
+
.coverage
|
|
50
|
+
.coverage.*
|
|
51
|
+
.cache
|
|
52
|
+
nosetests.xml
|
|
53
|
+
coverage.xml
|
|
54
|
+
*.cover
|
|
55
|
+
*.py,cover
|
|
56
|
+
.hypothesis/
|
|
57
|
+
.pytest_cache/
|
|
58
|
+
|
|
59
|
+
# Translations
|
|
60
|
+
*.mo
|
|
61
|
+
*.pot
|
|
62
|
+
|
|
63
|
+
# Django stuff:
|
|
64
|
+
*.log
|
|
65
|
+
local_settings.py
|
|
66
|
+
db.sqlite3
|
|
67
|
+
db.sqlite3-journal
|
|
68
|
+
|
|
69
|
+
# Flask stuff:
|
|
70
|
+
instance/
|
|
71
|
+
.webassets-cache
|
|
72
|
+
|
|
73
|
+
# Scrapy stuff:
|
|
74
|
+
.scrapy
|
|
75
|
+
|
|
76
|
+
# Sphinx documentation
|
|
77
|
+
docs/source/_build/
|
|
78
|
+
|
|
79
|
+
# PyBuilder
|
|
80
|
+
target/
|
|
81
|
+
|
|
82
|
+
# Jupyter Notebook
|
|
83
|
+
.ipynb_checkpoints
|
|
84
|
+
|
|
85
|
+
# IPython
|
|
86
|
+
profile_default/
|
|
87
|
+
ipython_config.py
|
|
88
|
+
|
|
89
|
+
# pyenv
|
|
90
|
+
.python-version
|
|
91
|
+
|
|
92
|
+
# pipenv
|
|
93
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
94
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
95
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
96
|
+
# install all needed dependencies.
|
|
97
|
+
#Pipfile.lock
|
|
98
|
+
|
|
99
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
|
100
|
+
__pypackages__/
|
|
101
|
+
|
|
102
|
+
# Celery stuff
|
|
103
|
+
celerybeat-schedule
|
|
104
|
+
celerybeat.pid
|
|
105
|
+
|
|
106
|
+
# SageMath parsed files
|
|
107
|
+
*.sage.py
|
|
108
|
+
|
|
109
|
+
# Environments
|
|
110
|
+
.env
|
|
111
|
+
.venv
|
|
112
|
+
env/
|
|
113
|
+
venv/
|
|
114
|
+
ENV/
|
|
115
|
+
env.bak/
|
|
116
|
+
venv.bak/
|
|
117
|
+
|
|
118
|
+
# Spyder project settings
|
|
119
|
+
.spyderproject
|
|
120
|
+
.spyproject
|
|
121
|
+
|
|
122
|
+
# Rope project settings
|
|
123
|
+
.ropeproject
|
|
124
|
+
|
|
125
|
+
# mkdocs documentation
|
|
126
|
+
/site
|
|
127
|
+
|
|
128
|
+
# mypy
|
|
129
|
+
.mypy_cache/
|
|
130
|
+
.dmypy.json
|
|
131
|
+
dmypy.json
|
|
132
|
+
|
|
133
|
+
# Pyre type checker
|
|
134
|
+
.pyre/
|
|
135
|
+
|
|
136
|
+
# project
|
|
137
|
+
test_schemas/
|
|
138
|
+
.idea/.gitignore
|
|
139
|
+
.idea/.name
|
|
140
|
+
.idea/misc.xml
|
|
141
|
+
.idea/modules.xml
|
|
142
|
+
.idea/selector_schema_codegen.iml
|
|
143
|
+
.idea/vcs.xml
|
|
144
|
+
.idea/yh_parser.iml
|
|
145
|
+
.idea/inspectionProfiles/profiles_settings.xml
|
|
146
|
+
.idea/inspectionProfiles/Project_Default.xml
|
|
147
|
+
.idea/libraries/Dart_SDK.xml
|
|
@@ -1,21 +1,21 @@
|
|
|
1
|
-
MIT License
|
|
2
|
-
|
|
3
|
-
Copyright (c) 2023 Georgiy
|
|
4
|
-
|
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
-
in the Software without restriction, including without limitation the rights
|
|
8
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
-
furnished to do so, subject to the following conditions:
|
|
11
|
-
|
|
12
|
-
The above copyright notice and this permission notice shall be included in all
|
|
13
|
-
copies or substantial portions of the Software.
|
|
14
|
-
|
|
15
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2023 Georgiy
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
21
|
SOFTWARE.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ssc_codegen
|
|
3
|
-
Version: 0.17.1
|
|
3
|
+
Version: 0.18.0
|
|
4
4
|
Summary: Python-dsl code converter to html parser for web scraping
|
|
5
5
|
Project-URL: Documentation, https://github.com/vypivshiy/selector_schema_codegen#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/vypivshiy/selector_schema_codegen/issues
|
|
@@ -86,7 +86,7 @@ ssc-gen generate examples/ -t js-pure -o ./output
|
|
|
86
86
|
ssc-gen generate schema.kdl -t go-goquery -o ./parsers --package scraper
|
|
87
87
|
```
|
|
88
88
|
|
|
89
|
-
Targets: `py-bs4`, `py-lxml`, `py-parsel`, `py-slax`, `js-pure`
|
|
89
|
+
Targets: `py-bs4`, `py-lxml`, `py-parsel`, `py-slax`, `js-pure`, `go-goquery`
|
|
90
90
|
|
|
91
91
|
### Lint schemas
|
|
92
92
|
|
|
@@ -1,124 +1,124 @@
|
|
|
1
|
-
# ssc-codegen
|
|
2
|
-
|
|
3
|
-
Code generator for web scraping parsers. Describe HTML extraction rules in a declarative KDL 2.0 DSL, then generate ready-to-use parser code for multiple languages and libraries.
|
|
4
|
-
|
|
5
|
-
```
|
|
6
|
-
.kdl schema --> [kdl parser] --> AST --> [linter] --> [converter] --> output code
|
|
7
|
-
```
|
|
8
|
-
|
|
9
|
-
## Features
|
|
10
|
-
|
|
11
|
-
- Declarative DSL based on KDL 2.0 syntax
|
|
12
|
-
- Static type checking and linting before code generation
|
|
13
|
-
- Multiple output targets: Python (bs4, lxml, parsel, selectolax), JavaScript (DOM API)
|
|
14
|
-
- Struct types: `item`, `list`, `dict`, `table`, `flat`
|
|
15
|
-
- LLM-friendly: system prompt + linter loop for AI-assisted schema generation
|
|
16
|
-
|
|
17
|
-
## Install
|
|
18
|
-
|
|
19
|
-
```bash
|
|
20
|
-
uv tool install ssc_codegen
|
|
21
|
-
```
|
|
22
|
-
|
|
23
|
-
## Quick example
|
|
24
|
-
|
|
25
|
-
`books.kdl`:
|
|
26
|
-
|
|
27
|
-
```kdl
|
|
28
|
-
struct Book type=list {
|
|
29
|
-
@split-doc { css-all ".product-card" }
|
|
30
|
-
|
|
31
|
-
title { css ".title"; text }
|
|
32
|
-
price { css ".price"; text; re #"(\d+\.\d+)"#; to-float }
|
|
33
|
-
url { css "a[href]"; attr "href"; fallback #null }
|
|
34
|
-
}
|
|
35
|
-
```
|
|
36
|
-
|
|
37
|
-
Generate Python parser:
|
|
38
|
-
|
|
39
|
-
```bash
|
|
40
|
-
ssc-gen generate books.kdl -t py-bs4 -o ./output
|
|
41
|
-
```
|
|
42
|
-
|
|
43
|
-
## Usage
|
|
44
|
-
|
|
45
|
-
### Generate code
|
|
46
|
-
|
|
47
|
-
```bash
|
|
48
|
-
# single file
|
|
49
|
-
ssc-gen generate schema.kdl -t py-bs4 -o ./output
|
|
50
|
-
|
|
51
|
-
# all .kdl files in a directory
|
|
52
|
-
ssc-gen generate examples/ -t js-pure -o ./output
|
|
53
|
-
|
|
54
|
-
# with custom package name (for Go and other targets)
|
|
55
|
-
ssc-gen generate schema.kdl -t go-goquery -o ./parsers --package scraper
|
|
56
|
-
```
|
|
57
|
-
|
|
58
|
-
Targets: `py-bs4`, `py-lxml`, `py-parsel`, `py-slax`, `js-pure`
|
|
59
|
-
|
|
60
|
-
### Lint schemas
|
|
61
|
-
|
|
62
|
-
```bash
|
|
63
|
-
# human-readable output
|
|
64
|
-
ssc-gen check schema.kdl
|
|
65
|
-
|
|
66
|
-
# JSON output (for LLM pipelines)
|
|
67
|
-
ssc-gen check schema.kdl -f json
|
|
68
|
-
|
|
69
|
-
# check all files in a directory
|
|
70
|
-
ssc-gen check examples/
|
|
71
|
-
```
|
|
72
|
-
|
|
73
|
-
### Test schema against HTML
|
|
74
|
-
|
|
75
|
-
```bash
|
|
76
|
-
# from file
|
|
77
|
-
ssc-gen run examples/booksToScrape.kdl:MainCatalogue -t py-bs4 -i page.html
|
|
78
|
-
|
|
79
|
-
# from stdin
|
|
80
|
-
curl https://books.toscrape.com/ | ssc-gen run examples/booksToScrape.kdl:MainCatalogue -t py-bs4
|
|
81
|
-
```
|
|
82
|
-
|
|
83
|
-
### Health check (verify selectors match elements)
|
|
84
|
-
|
|
85
|
-
```bash
|
|
86
|
-
# from file
|
|
87
|
-
ssc-gen health examples/booksToScrape.kdl:MainCatalogue -i page.html
|
|
88
|
-
|
|
89
|
-
# from stdin
|
|
90
|
-
curl https://books.toscrape.com/ | ssc-gen health examples/booksToScrape.kdl:MainCatalogue
|
|
91
|
-
```
|
|
92
|
-
|
|
93
|
-
## Documentation
|
|
94
|
-
|
|
95
|
-
- [Quick start](docs2/guide.md)
|
|
96
|
-
- [Syntax and file structure](docs2/syntax.md)
|
|
97
|
-
- [Type system](docs2/types.md)
|
|
98
|
-
- [Pipeline operations](docs2/operations.md)
|
|
99
|
-
- [Predicates and logic](docs2/predicates.md)
|
|
100
|
-
- [JSON schemas and jsonify](docs2/json.md)
|
|
101
|
-
- [Transforms and dsl blocks](docs2/transforms.md)
|
|
102
|
-
- [LLM-compact reference](docs2/llm.txt) -- full DSL spec in one file for LLM context
|
|
103
|
-
- [Examples](examples/)
|
|
104
|
-
|
|
105
|
-
## LLM integration
|
|
106
|
-
|
|
107
|
-
LLM agents can generate and validate `.kdl` schemas automatically using the linter feedback loop.
|
|
108
|
-
|
|
109
|
-
### In chats (ChatGPT, Claude, etc.)
|
|
110
|
-
|
|
111
|
-
Use [SYSTEM_PROMPT.md](SYSTEM_PROMPT.md) as system prompt. After generation, run `ssc-gen check -f json` and send errors back to the LLM for correction.
|
|
112
|
-
|
|
113
|
-
### In AI-powered IDEs (Claude Code, Cursor, etc.)
|
|
114
|
-
|
|
115
|
-
Use the [kdl-schema-dsl](.agents/skills/kdl-schema-dsl) skill for automatic generation, validation, and iteration.
|
|
116
|
-
|
|
117
|
-
## Development
|
|
118
|
-
|
|
119
|
-
```bash
|
|
120
|
-
uv sync # install dependencies
|
|
121
|
-
uv build --wheel # build wheel
|
|
122
|
-
uv run pytest # run tests
|
|
123
|
-
uv run ruff check ssc_codegen/
|
|
124
|
-
```
|
|
1
|
+
# ssc-codegen
|
|
2
|
+
|
|
3
|
+
Code generator for web scraping parsers. Describe HTML extraction rules in a declarative KDL 2.0 DSL, then generate ready-to-use parser code for multiple languages and libraries.
|
|
4
|
+
|
|
5
|
+
```
|
|
6
|
+
.kdl schema --> [kdl parser] --> AST --> [linter] --> [converter] --> output code
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- Declarative DSL based on KDL 2.0 syntax
|
|
12
|
+
- Static type checking and linting before code generation
|
|
13
|
+
- Multiple output targets: Python (bs4, lxml, parsel, selectolax), JavaScript (DOM API)
|
|
14
|
+
- Struct types: `item`, `list`, `dict`, `table`, `flat`
|
|
15
|
+
- LLM-friendly: system prompt + linter loop for AI-assisted schema generation
|
|
16
|
+
|
|
17
|
+
## Install
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
uv tool install ssc_codegen
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quick example
|
|
24
|
+
|
|
25
|
+
`books.kdl`:
|
|
26
|
+
|
|
27
|
+
```kdl
|
|
28
|
+
struct Book type=list {
|
|
29
|
+
@split-doc { css-all ".product-card" }
|
|
30
|
+
|
|
31
|
+
title { css ".title"; text }
|
|
32
|
+
price { css ".price"; text; re #"(\d+\.\d+)"#; to-float }
|
|
33
|
+
url { css "a[href]"; attr "href"; fallback #null }
|
|
34
|
+
}
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Generate Python parser:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
ssc-gen generate books.kdl -t py-bs4 -o ./output
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
|
|
45
|
+
### Generate code
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
# single file
|
|
49
|
+
ssc-gen generate schema.kdl -t py-bs4 -o ./output
|
|
50
|
+
|
|
51
|
+
# all .kdl files in a directory
|
|
52
|
+
ssc-gen generate examples/ -t js-pure -o ./output
|
|
53
|
+
|
|
54
|
+
# with custom package name (for Go and other targets)
|
|
55
|
+
ssc-gen generate schema.kdl -t go-goquery -o ./parsers --package scraper
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Targets: `py-bs4`, `py-lxml`, `py-parsel`, `py-slax`, `js-pure`, `go-goquery`
|
|
59
|
+
|
|
60
|
+
### Lint schemas
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
# human-readable output
|
|
64
|
+
ssc-gen check schema.kdl
|
|
65
|
+
|
|
66
|
+
# JSON output (for LLM pipelines)
|
|
67
|
+
ssc-gen check schema.kdl -f json
|
|
68
|
+
|
|
69
|
+
# check all files in a directory
|
|
70
|
+
ssc-gen check examples/
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Test schema against HTML
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
# from file
|
|
77
|
+
ssc-gen run examples/booksToScrape.kdl:MainCatalogue -t py-bs4 -i page.html
|
|
78
|
+
|
|
79
|
+
# from stdin
|
|
80
|
+
curl https://books.toscrape.com/ | ssc-gen run examples/booksToScrape.kdl:MainCatalogue -t py-bs4
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Health check (verify selectors match elements)
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
# from file
|
|
87
|
+
ssc-gen health examples/booksToScrape.kdl:MainCatalogue -i page.html
|
|
88
|
+
|
|
89
|
+
# from stdin
|
|
90
|
+
curl https://books.toscrape.com/ | ssc-gen health examples/booksToScrape.kdl:MainCatalogue
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Documentation
|
|
94
|
+
|
|
95
|
+
- [Quick start](docs2/guide.md)
|
|
96
|
+
- [Syntax and file structure](docs2/syntax.md)
|
|
97
|
+
- [Type system](docs2/types.md)
|
|
98
|
+
- [Pipeline operations](docs2/operations.md)
|
|
99
|
+
- [Predicates and logic](docs2/predicates.md)
|
|
100
|
+
- [JSON schemas and jsonify](docs2/json.md)
|
|
101
|
+
- [Transforms and dsl blocks](docs2/transforms.md)
|
|
102
|
+
- [LLM-compact reference](docs2/llm.txt) -- full DSL spec in one file for LLM context
|
|
103
|
+
- [Examples](examples/)
|
|
104
|
+
|
|
105
|
+
## LLM integration
|
|
106
|
+
|
|
107
|
+
LLM agents can generate and validate `.kdl` schemas automatically using the linter feedback loop.
|
|
108
|
+
|
|
109
|
+
### In chats (ChatGPT, Claude, etc.)
|
|
110
|
+
|
|
111
|
+
Use [SYSTEM_PROMPT.md](SYSTEM_PROMPT.md) as system prompt. After generation, run `ssc-gen check -f json` and send errors back to the LLM for correction.
|
|
112
|
+
|
|
113
|
+
### In AI-powered IDEs (Claude Code, Cursor, etc.)
|
|
114
|
+
|
|
115
|
+
Use the [kdl-schema-dsl](.agents/skills/kdl-schema-dsl) skill for automatic generation, validation, and iteration.
|
|
116
|
+
|
|
117
|
+
## Development
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
uv sync # install dependencies
|
|
121
|
+
uv build --wheel # build wheel
|
|
122
|
+
uv run pytest # run tests
|
|
123
|
+
uv run ruff check ssc_codegen/
|
|
124
|
+
```
|