ssc_codegen 0.15.3__tar.gz → 0.17.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ssc_codegen-0.15.3 → ssc_codegen-0.17.1}/.gitignore +5 -1
- ssc_codegen-0.17.1/PKG-INFO +155 -0
- ssc_codegen-0.17.1/README.md +124 -0
- {ssc_codegen-0.15.3 → ssc_codegen-0.17.1}/pyproject.toml +10 -14
- ssc_codegen-0.17.1/ssc_codegen/__init__.py +27 -0
- ssc_codegen-0.17.1/ssc_codegen/_logging.py +102 -0
- ssc_codegen-0.17.1/ssc_codegen/ast/__init__.py +174 -0
- ssc_codegen-0.17.1/ssc_codegen/ast/array.py +56 -0
- ssc_codegen-0.17.1/ssc_codegen/ast/base.py +24 -0
- ssc_codegen-0.17.1/ssc_codegen/ast/cast.py +72 -0
- ssc_codegen-0.17.1/ssc_codegen/ast/control.py +82 -0
- ssc_codegen-0.17.1/ssc_codegen/ast/extract.py +51 -0
- ssc_codegen-0.17.1/ssc_codegen/ast/helpers.py +7 -0
- ssc_codegen-0.17.1/ssc_codegen/ast/jsondef.py +37 -0
- ssc_codegen-0.17.1/ssc_codegen/ast/module.py +92 -0
- ssc_codegen-0.17.1/ssc_codegen/ast/predicate_containers.py +44 -0
- ssc_codegen-0.17.1/ssc_codegen/ast/predicate_ops.py +309 -0
- ssc_codegen-0.17.1/ssc_codegen/ast/regex.py +57 -0
- ssc_codegen-0.17.1/ssc_codegen/ast/selectors.py +55 -0
- ssc_codegen-0.17.1/ssc_codegen/ast/string.py +158 -0
- ssc_codegen-0.17.1/ssc_codegen/ast/struct.py +232 -0
- ssc_codegen-0.17.1/ssc_codegen/ast/transform.py +63 -0
- ssc_codegen-0.17.1/ssc_codegen/ast/typedef.py +41 -0
- ssc_codegen-0.17.1/ssc_codegen/ast/types.py +79 -0
- ssc_codegen-0.17.1/ssc_codegen/converters/base.py +327 -0
- ssc_codegen-0.17.1/ssc_codegen/converters/helpers.py +76 -0
- ssc_codegen-0.17.1/ssc_codegen/converters/js_pure.py +1445 -0
- ssc_codegen-0.17.1/ssc_codegen/converters/py_bs4.py +1384 -0
- ssc_codegen-0.17.1/ssc_codegen/converters/py_lxml.py +538 -0
- ssc_codegen-0.17.1/ssc_codegen/converters/py_parsel.py +475 -0
- ssc_codegen-0.17.1/ssc_codegen/converters/py_slax.py +428 -0
- ssc_codegen-0.17.1/ssc_codegen/document_utils.py +107 -0
- ssc_codegen-0.17.1/ssc_codegen/exceptions.py +11 -0
- ssc_codegen-0.17.1/ssc_codegen/health.py +418 -0
- ssc_codegen-0.17.1/ssc_codegen/kdl/__init__.py +35 -0
- ssc_codegen-0.17.1/ssc_codegen/kdl/parser.py +954 -0
- ssc_codegen-0.17.1/ssc_codegen/linter/__init__.py +26 -0
- ssc_codegen-0.17.1/ssc_codegen/linter/_kdl_lang.py +415 -0
- ssc_codegen-0.17.1/ssc_codegen/linter/base.py +818 -0
- ssc_codegen-0.17.1/ssc_codegen/linter/errors.py +78 -0
- ssc_codegen-0.17.1/ssc_codegen/linter/format_errors.py +285 -0
- ssc_codegen-0.17.1/ssc_codegen/linter/metadata.py +18 -0
- ssc_codegen-0.17.1/ssc_codegen/linter/navigation.py +262 -0
- ssc_codegen-0.17.1/ssc_codegen/linter/path.py +36 -0
- ssc_codegen-0.17.1/ssc_codegen/linter/rule_keywords.py +320 -0
- ssc_codegen-0.17.1/ssc_codegen/linter/rules.py +848 -0
- ssc_codegen-0.17.1/ssc_codegen/linter/rules_struct.py +571 -0
- ssc_codegen-0.17.1/ssc_codegen/linter/type_rules.py +523 -0
- ssc_codegen-0.17.1/ssc_codegen/linter/types.py +188 -0
- ssc_codegen-0.17.1/ssc_codegen/main.py +616 -0
- ssc_codegen-0.17.1/ssc_codegen/parser.py +2298 -0
- ssc_codegen-0.17.1/ssc_codegen/pseudo_selectors.py +39 -0
- ssc_codegen-0.17.1/ssc_codegen/regex_utils.py +204 -0
- ssc_codegen-0.17.1/ssc_codegen/selector_utils.py +8 -0
- ssc_codegen-0.15.3/PKG-INFO +0 -255
- ssc_codegen-0.15.3/README.md +0 -220
- ssc_codegen-0.15.3/ssc_codegen/__init__.py +0 -99
- ssc_codegen-0.15.3/ssc_codegen/_compat.py +0 -31
- ssc_codegen-0.15.3/ssc_codegen/ast_/__init__.py +0 -131
- ssc_codegen-0.15.3/ssc_codegen/ast_/base.py +0 -242
- ssc_codegen-0.15.3/ssc_codegen/ast_/nodes_array.py +0 -87
- ssc_codegen-0.15.3/ssc_codegen/ast_/nodes_cast.py +0 -154
- ssc_codegen-0.15.3/ssc_codegen/ast_/nodes_core.py +0 -591
- ssc_codegen-0.15.3/ssc_codegen/ast_/nodes_filter.py +0 -659
- ssc_codegen-0.15.3/ssc_codegen/ast_/nodes_selectors.py +0 -305
- ssc_codegen-0.15.3/ssc_codegen/ast_/nodes_string.py +0 -495
- ssc_codegen-0.15.3/ssc_codegen/ast_/nodes_validate.py +0 -207
- ssc_codegen-0.15.3/ssc_codegen/ast_build/__init__.py +0 -1
- ssc_codegen-0.15.3/ssc_codegen/ast_build/builder.py +0 -464
- ssc_codegen-0.15.3/ssc_codegen/ast_build/main.py +0 -94
- ssc_codegen-0.15.3/ssc_codegen/ast_build/utils.py +0 -155
- ssc_codegen-0.15.3/ssc_codegen/ast_grep_rules/js_rules.yml +0 -20
- ssc_codegen-0.15.3/ssc_codegen/ast_grep_rules/py_drop_prefix_suffix_backport.yml +0 -47
- ssc_codegen-0.15.3/ssc_codegen/ast_grep_rules/py_rules.yml +0 -20
- ssc_codegen-0.15.3/ssc_codegen/cli/__init__.py +0 -0
- ssc_codegen-0.15.3/ssc_codegen/cli/ast_grep.py +0 -10
- ssc_codegen-0.15.3/ssc_codegen/cli/cli_callbacks.py +0 -34
- ssc_codegen-0.15.3/ssc_codegen/cli/cli_utils.py +0 -84
- ssc_codegen-0.15.3/ssc_codegen/cli/code_callbacks.py +0 -32
- ssc_codegen-0.15.3/ssc_codegen/cli/consts.py +0 -61
- ssc_codegen-0.15.3/ssc_codegen/cli/main.py +0 -643
- ssc_codegen-0.15.3/ssc_codegen/cli/runtime_parse_runners.py +0 -72
- ssc_codegen-0.15.3/ssc_codegen/compiler.py +0 -113
- ssc_codegen-0.15.3/ssc_codegen/converters/__init__.py +0 -0
- ssc_codegen-0.15.3/ssc_codegen/converters/base.py +0 -577
- ssc_codegen-0.15.3/ssc_codegen/converters/go_goquery.py +0 -2022
- ssc_codegen-0.15.3/ssc_codegen/converters/helpers.py +0 -257
- ssc_codegen-0.15.3/ssc_codegen/converters/js_pure.py +0 -1534
- ssc_codegen-0.15.3/ssc_codegen/converters/lua_htmlparser.py +0 -1510
- ssc_codegen-0.15.3/ssc_codegen/converters/py_base.py +0 -1359
- ssc_codegen-0.15.3/ssc_codegen/converters/py_bs4.py +0 -504
- ssc_codegen-0.15.3/ssc_codegen/converters/py_lxml.py +0 -551
- ssc_codegen-0.15.3/ssc_codegen/converters/py_parsel.py +0 -533
- ssc_codegen-0.15.3/ssc_codegen/converters/py_selectolax.py +0 -501
- ssc_codegen-0.15.3/ssc_codegen/converters/templates/__init__.py +0 -3
- ssc_codegen-0.15.3/ssc_codegen/converters/templates/go_goquery.py +0 -473
- ssc_codegen-0.15.3/ssc_codegen/converters/templates/js_pure.py +0 -38
- ssc_codegen-0.15.3/ssc_codegen/converters/templates/lua_base.py +0 -710
- ssc_codegen-0.15.3/ssc_codegen/converters/templates/lua_css_compat.py +0 -157
- ssc_codegen-0.15.3/ssc_codegen/converters/templates/lua_re_compat.py +0 -517
- ssc_codegen-0.15.3/ssc_codegen/converters/templates/py_base.py +0 -60
- ssc_codegen-0.15.3/ssc_codegen/document.py +0 -2661
- ssc_codegen-0.15.3/ssc_codegen/document_utlis.py +0 -201
- ssc_codegen-0.15.3/ssc_codegen/json_struct.py +0 -272
- ssc_codegen-0.15.3/ssc_codegen/json_to_scc.py +0 -180
- ssc_codegen-0.15.3/ssc_codegen/logs.py +0 -49
- ssc_codegen-0.15.3/ssc_codegen/pseudo_selectors.py +0 -70
- ssc_codegen-0.15.3/ssc_codegen/schema.py +0 -368
- ssc_codegen-0.15.3/ssc_codegen/selector_utils.py +0 -78
- ssc_codegen-0.15.3/ssc_codegen/static_checker/__init__.py +0 -62
- ssc_codegen-0.15.3/ssc_codegen/static_checker/base.py +0 -91
- ssc_codegen-0.15.3/ssc_codegen/static_checker/callbacks.py +0 -484
- ssc_codegen-0.15.3/ssc_codegen/str_utils.py +0 -230
- ssc_codegen-0.15.3/ssc_codegen/tokens.py +0 -291
- ssc_codegen-0.15.3/ssc_codegen/transform.py +0 -133
- {ssc_codegen-0.15.3 → ssc_codegen-0.17.1}/LICENSE +0 -0
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ssc_codegen
|
|
3
|
+
Version: 0.17.1
|
|
4
|
+
Summary: Python-dsl code converter to html parser for web scraping
|
|
5
|
+
Project-URL: Documentation, https://github.com/vypivshiy/selector_schema_codegen#readme
|
|
6
|
+
Project-URL: Issues, https://github.com/vypivshiy/selector_schema_codegen/issues
|
|
7
|
+
Project-URL: Source, https://github.com/vypivshiy/selector_schema_codegen
|
|
8
|
+
Project-URL: Examples, https://github.com/vypivshiy/selector_schema_codegen/examples
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Software Development :: Code Generators
|
|
19
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
20
|
+
Classifier: Topic :: Utilities
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Requires-Dist: bs4>=0.0.2
|
|
23
|
+
Requires-Dist: click<8.2.0
|
|
24
|
+
Requires-Dist: colorama>=0.4.6; sys_platform == 'win32'
|
|
25
|
+
Requires-Dist: cssselect>=1.2.0
|
|
26
|
+
Requires-Dist: lxml>=5.3.0
|
|
27
|
+
Requires-Dist: soupsieve>=2.6
|
|
28
|
+
Requires-Dist: typer>=0.15.1
|
|
29
|
+
Requires-Dist: typing-extensions; python_version < '3.11'
|
|
30
|
+
Description-Content-Type: text/markdown
|
|
31
|
+
|
|
32
|
+
# ssc-codegen
|
|
33
|
+
|
|
34
|
+
Code generator for web scraping parsers. Describe HTML extraction rules in a declarative KDL 2.0 DSL, then generate ready-to-use parser code for multiple languages and libraries.
|
|
35
|
+
|
|
36
|
+
```
|
|
37
|
+
.kdl schema --> [kdl parser] --> AST --> [linter] --> [converter] --> output code
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Features
|
|
41
|
+
|
|
42
|
+
- Declarative DSL based on KDL 2.0 syntax
|
|
43
|
+
- Static type checking and linting before code generation
|
|
44
|
+
- Multiple output targets: Python (bs4, lxml, parsel, selectolax), JavaScript (DOM API)
|
|
45
|
+
- Struct types: `item`, `list`, `dict`, `table`, `flat`
|
|
46
|
+
- LLM-friendly: system prompt + linter loop for AI-assisted schema generation
|
|
47
|
+
|
|
48
|
+
## Install
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
uv tool install ssc_codegen
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Quick example
|
|
55
|
+
|
|
56
|
+
`books.kdl`:
|
|
57
|
+
|
|
58
|
+
```kdl
|
|
59
|
+
struct Book type=list {
|
|
60
|
+
@split-doc { css-all ".product-card" }
|
|
61
|
+
|
|
62
|
+
title { css ".title"; text }
|
|
63
|
+
price { css ".price"; text; re #"(\d+\.\d+)"#; to-float }
|
|
64
|
+
url { css "a[href]"; attr "href"; fallback #null }
|
|
65
|
+
}
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Generate Python parser:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
ssc-gen generate books.kdl -t py-bs4 -o ./output
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Usage
|
|
75
|
+
|
|
76
|
+
### Generate code
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
# single file
|
|
80
|
+
ssc-gen generate schema.kdl -t py-bs4 -o ./output
|
|
81
|
+
|
|
82
|
+
# all .kdl files in a directory
|
|
83
|
+
ssc-gen generate examples/ -t js-pure -o ./output
|
|
84
|
+
|
|
85
|
+
# with custom package name (for Go and other targets)
|
|
86
|
+
ssc-gen generate schema.kdl -t go-goquery -o ./parsers --package scraper
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Targets: `py-bs4`, `py-lxml`, `py-parsel`, `py-slax`, `js-pure`
|
|
90
|
+
|
|
91
|
+
### Lint schemas
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
# human-readable output
|
|
95
|
+
ssc-gen check schema.kdl
|
|
96
|
+
|
|
97
|
+
# JSON output (for LLM pipelines)
|
|
98
|
+
ssc-gen check schema.kdl -f json
|
|
99
|
+
|
|
100
|
+
# check all files in a directory
|
|
101
|
+
ssc-gen check examples/
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Test schema against HTML
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
# from file
|
|
108
|
+
ssc-gen run examples/booksToScrape.kdl:MainCatalogue -t py-bs4 -i page.html
|
|
109
|
+
|
|
110
|
+
# from stdin
|
|
111
|
+
curl https://books.toscrape.com/ | ssc-gen run examples/booksToScrape.kdl:MainCatalogue -t py-bs4
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Health check (verify selectors match elements)
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
# from file
|
|
118
|
+
ssc-gen health examples/booksToScrape.kdl:MainCatalogue -i page.html
|
|
119
|
+
|
|
120
|
+
# from stdin
|
|
121
|
+
curl https://books.toscrape.com/ | ssc-gen health examples/booksToScrape.kdl:MainCatalogue
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Documentation
|
|
125
|
+
|
|
126
|
+
- [Quick start](docs2/guide.md)
|
|
127
|
+
- [Syntax and file structure](docs2/syntax.md)
|
|
128
|
+
- [Type system](docs2/types.md)
|
|
129
|
+
- [Pipeline operations](docs2/operations.md)
|
|
130
|
+
- [Predicates and logic](docs2/predicates.md)
|
|
131
|
+
- [JSON schemas and jsonify](docs2/json.md)
|
|
132
|
+
- [Transforms and dsl blocks](docs2/transforms.md)
|
|
133
|
+
- [LLM-compact reference](docs2/llm.txt) -- full DSL spec in one file for LLM context
|
|
134
|
+
- [Examples](examples/)
|
|
135
|
+
|
|
136
|
+
## LLM integration
|
|
137
|
+
|
|
138
|
+
LLM agents can generate and validate `.kdl` schemas automatically using the linter feedback loop.
|
|
139
|
+
|
|
140
|
+
### In chats (ChatGPT, Claude, etc.)
|
|
141
|
+
|
|
142
|
+
Use [SYSTEM_PROMPT.md](SYSTEM_PROMPT.md) as system prompt. After generation, run `ssc-gen check -f json` and send errors back to the LLM for correction.
|
|
143
|
+
|
|
144
|
+
### In AI-powered IDEs (Claude Code, Cursor, etc.)
|
|
145
|
+
|
|
146
|
+
Use the [kdl-schema-dsl](.agents/skills/kdl-schema-dsl) skill for automatic generation, validation, and iteration.
|
|
147
|
+
|
|
148
|
+
## Development
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
uv sync # install dependencies
|
|
152
|
+
uv build --wheel # build wheel
|
|
153
|
+
uv run pytest # run tests
|
|
154
|
+
uv run ruff check ssc_codegen/
|
|
155
|
+
```
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# ssc-codegen
|
|
2
|
+
|
|
3
|
+
Code generator for web scraping parsers. Describe HTML extraction rules in a declarative KDL 2.0 DSL, then generate ready-to-use parser code for multiple languages and libraries.
|
|
4
|
+
|
|
5
|
+
```
|
|
6
|
+
.kdl schema --> [kdl parser] --> AST --> [linter] --> [converter] --> output code
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- Declarative DSL based on KDL 2.0 syntax
|
|
12
|
+
- Static type checking and linting before code generation
|
|
13
|
+
- Multiple output targets: Python (bs4, lxml, parsel, selectolax), JavaScript (DOM API)
|
|
14
|
+
- Struct types: `item`, `list`, `dict`, `table`, `flat`
|
|
15
|
+
- LLM-friendly: system prompt + linter loop for AI-assisted schema generation
|
|
16
|
+
|
|
17
|
+
## Install
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
uv tool install ssc_codegen
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quick example
|
|
24
|
+
|
|
25
|
+
`books.kdl`:
|
|
26
|
+
|
|
27
|
+
```kdl
|
|
28
|
+
struct Book type=list {
|
|
29
|
+
@split-doc { css-all ".product-card" }
|
|
30
|
+
|
|
31
|
+
title { css ".title"; text }
|
|
32
|
+
price { css ".price"; text; re #"(\d+\.\d+)"#; to-float }
|
|
33
|
+
url { css "a[href]"; attr "href"; fallback #null }
|
|
34
|
+
}
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Generate Python parser:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
ssc-gen generate books.kdl -t py-bs4 -o ./output
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
|
|
45
|
+
### Generate code
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
# single file
|
|
49
|
+
ssc-gen generate schema.kdl -t py-bs4 -o ./output
|
|
50
|
+
|
|
51
|
+
# all .kdl files in a directory
|
|
52
|
+
ssc-gen generate examples/ -t js-pure -o ./output
|
|
53
|
+
|
|
54
|
+
# with custom package name (for Go and other targets)
|
|
55
|
+
ssc-gen generate schema.kdl -t go-goquery -o ./parsers --package scraper
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Targets: `py-bs4`, `py-lxml`, `py-parsel`, `py-slax`, `js-pure`
|
|
59
|
+
|
|
60
|
+
### Lint schemas
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
# human-readable output
|
|
64
|
+
ssc-gen check schema.kdl
|
|
65
|
+
|
|
66
|
+
# JSON output (for LLM pipelines)
|
|
67
|
+
ssc-gen check schema.kdl -f json
|
|
68
|
+
|
|
69
|
+
# check all files in a directory
|
|
70
|
+
ssc-gen check examples/
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Test schema against HTML
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
# from file
|
|
77
|
+
ssc-gen run examples/booksToScrape.kdl:MainCatalogue -t py-bs4 -i page.html
|
|
78
|
+
|
|
79
|
+
# from stdin
|
|
80
|
+
curl https://books.toscrape.com/ | ssc-gen run examples/booksToScrape.kdl:MainCatalogue -t py-bs4
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Health check (verify selectors match elements)
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
# from file
|
|
87
|
+
ssc-gen health examples/booksToScrape.kdl:MainCatalogue -i page.html
|
|
88
|
+
|
|
89
|
+
# from stdin
|
|
90
|
+
curl https://books.toscrape.com/ | ssc-gen health examples/booksToScrape.kdl:MainCatalogue
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Documentation
|
|
94
|
+
|
|
95
|
+
- [Quick start](docs2/guide.md)
|
|
96
|
+
- [Syntax and file structure](docs2/syntax.md)
|
|
97
|
+
- [Type system](docs2/types.md)
|
|
98
|
+
- [Pipeline operations](docs2/operations.md)
|
|
99
|
+
- [Predicates and logic](docs2/predicates.md)
|
|
100
|
+
- [JSON schemas and jsonify](docs2/json.md)
|
|
101
|
+
- [Transforms and dsl blocks](docs2/transforms.md)
|
|
102
|
+
- [LLM-compact reference](docs2/llm.txt) -- full DSL spec in one file for LLM context
|
|
103
|
+
- [Examples](examples/)
|
|
104
|
+
|
|
105
|
+
## LLM integration
|
|
106
|
+
|
|
107
|
+
LLM agents can generate and validate `.kdl` schemas automatically using the linter feedback loop.
|
|
108
|
+
|
|
109
|
+
### In chats (ChatGPT, Claude, etc.)
|
|
110
|
+
|
|
111
|
+
Use [SYSTEM_PROMPT.md](SYSTEM_PROMPT.md) as system prompt. After generation, run `ssc-gen check -f json` and send errors back to the LLM for correction.
|
|
112
|
+
|
|
113
|
+
### In AI-powered IDEs (Claude Code, Cursor, etc.)
|
|
114
|
+
|
|
115
|
+
Use the [kdl-schema-dsl](.agents/skills/kdl-schema-dsl) skill for automatic generation, validation, and iteration.
|
|
116
|
+
|
|
117
|
+
## Development
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
uv sync # install dependencies
|
|
121
|
+
uv build --wheel # build wheel
|
|
122
|
+
uv run pytest # run tests
|
|
123
|
+
uv run ruff check ssc_codegen/
|
|
124
|
+
```
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "ssc_codegen"
|
|
3
|
-
version = "0.15.3"
|
|
3
|
+
version = "0.17.1"
|
|
4
4
|
description = "Python-dsl code converter to html parser for web scraping "
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.10"
|
|
@@ -8,8 +8,6 @@ dependencies = [
|
|
|
8
8
|
"bs4>=0.0.2",
|
|
9
9
|
"colorama>=0.4.6 ; sys_platform == 'win32'",
|
|
10
10
|
"cssselect>=1.2.0",
|
|
11
|
-
"httpx>=0.28.1",
|
|
12
|
-
"ichrome>=4.0.4",
|
|
13
11
|
"lxml>=5.3.0",
|
|
14
12
|
"soupsieve>=2.6",
|
|
15
13
|
"typer>=0.15.1",
|
|
@@ -17,8 +15,6 @@ dependencies = [
|
|
|
17
15
|
# https://github.com/fastapi/typer/discussions/1215
|
|
18
16
|
# https://github.com/fastapi/typer/pull/1145
|
|
19
17
|
"click<8.2.0",
|
|
20
|
-
"ast-grep-cli>=0.38.1",
|
|
21
|
-
"tinycss2>=1.4.0",
|
|
22
18
|
]
|
|
23
19
|
|
|
24
20
|
classifiers = [
|
|
@@ -43,7 +39,7 @@ Examples = "https://github.com/vypivshiy/selector_schema_codegen/examples"
|
|
|
43
39
|
|
|
44
40
|
|
|
45
41
|
[project.scripts]
|
|
46
|
-
ssc-gen = 'ssc_codegen.
|
|
42
|
+
ssc-gen = 'ssc_codegen.main:main'
|
|
47
43
|
|
|
48
44
|
[build-system]
|
|
49
45
|
requires = ["hatchling"]
|
|
@@ -51,25 +47,25 @@ build-backend = "hatchling.build"
|
|
|
51
47
|
|
|
52
48
|
[tool.hatch.build.targets.sdist]
|
|
53
49
|
include = [
|
|
54
|
-
"ssc_codegen
|
|
50
|
+
"ssc_codegen/**",
|
|
55
51
|
]
|
|
56
52
|
|
|
53
|
+
[tool.hatch.build.targets.wheel]
|
|
54
|
+
packages = ["ssc_codegen"]
|
|
55
|
+
|
|
57
56
|
[dependency-groups]
|
|
58
57
|
dev = [
|
|
58
|
+
"bs4>=0.0.2",
|
|
59
59
|
"coverage>=7.6.12",
|
|
60
60
|
"httpx>=0.28.1",
|
|
61
|
+
"hypothesis>=6.151.9",
|
|
62
|
+
"lxml>=5.3.0",
|
|
61
63
|
"mypy>=1.14.1",
|
|
62
|
-
"parsel>=1.
|
|
64
|
+
"parsel>=1.9.1",
|
|
63
65
|
"pytest>=8.3.4",
|
|
64
66
|
"ruff>=0.9.3",
|
|
65
67
|
"selectolax>=0.3.27",
|
|
66
68
|
]
|
|
67
|
-
# python generated code tests
|
|
68
|
-
tests = [
|
|
69
|
-
"bs4>=0.0.2",
|
|
70
|
-
"parsel>=1.10.0",
|
|
71
|
-
"selectolax>=0.3.27",
|
|
72
|
-
]
|
|
73
69
|
|
|
74
70
|
[tool.ruff]
|
|
75
71
|
target-version = "py310"
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from ssc_codegen.parser import Module, PARSER
|
|
4
|
+
|
|
5
|
+
_KDL_TEXT_ENCODING = "utf-8-sig"
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def parse_ast(
    src: str | None = None,
    path: str | None = None,
    *,
    css_to_xpath: bool = False,
) -> Module:
    """Parse KDL schema text into a ``Module`` AST.

    The schema text comes from ``path`` when it is given, otherwise from
    ``src``. When both are passed, the file content replaces ``src``.

    Args:
        src: Schema text to parse.
        path: Path to a schema file. It is resolved to an absolute path, read
            with the ``_KDL_TEXT_ENCODING`` codec and forwarded to the parser
            as ``source_path``.
        css_to_xpath: When true, ``convert_css_to_xpath_module`` is applied to
            the parsed module before it is returned.

    Returns:
        The module produced by ``PARSER.parse``.

    Raises:
        AttributeError: Neither ``src`` nor ``path`` was supplied, or the file
            at ``path`` has no content.
    """
    source_path: Path | None = None
    if path:
        source_path = Path(path).resolve()
        src = source_path.read_text(encoding=_KDL_TEXT_ENCODING)
        if not src:
            # Report the real problem instead of claiming the argument is
            # missing: the caller did pass a path, the file is just empty.
            raise AttributeError(f"schema file is empty: {source_path}")
    elif not src:
        raise AttributeError("required src or path argument")

    module = PARSER.parse(src, source_path=source_path)
    if css_to_xpath:
        # Imported lazily, only when the conversion is requested.
        from ssc_codegen.document_utils import convert_css_to_xpath_module

        convert_css_to_xpath_module(module)
    return module
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Logging configuration for ssc_codegen.
|
|
2
|
+
|
|
3
|
+
Usage
|
|
4
|
+
-----
|
|
5
|
+
Import the logger in any submodule::
|
|
6
|
+
|
|
7
|
+
from ssc_codegen._logging import logger
|
|
8
|
+
|
|
9
|
+
To enable DEBUG output from the CLI, pass the ``--verbose`` / ``-v`` flag,
|
|
10
|
+
or configure it manually::
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
logging.getLogger("ssc_codegen").setLevel(logging.DEBUG)
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import logging
|
|
19
|
+
import sys
|
|
20
|
+
|
|
21
|
+
# Windows ANSI backport
|
|
22
|
+
try:
|
|
23
|
+
import colorama
|
|
24
|
+
|
|
25
|
+
colorama.init(autoreset=False)
|
|
26
|
+
_COLORAMA_AVAILABLE = True
|
|
27
|
+
except ImportError:
|
|
28
|
+
_COLORAMA_AVAILABLE = False
|
|
29
|
+
|
|
30
|
+
# ANSI color codes (used on all platforms; colorama translates them on Windows)
|
|
31
|
+
_RESET = "\033[0m"
|
|
32
|
+
_BOLD = "\033[1m"
|
|
33
|
+
_COLORS: dict[int, str] = {
|
|
34
|
+
logging.DEBUG: "\033[36m", # cyan
|
|
35
|
+
logging.INFO: "\033[32m", # green
|
|
36
|
+
logging.WARNING: "\033[33m", # yellow
|
|
37
|
+
logging.ERROR: "\033[31m", # red
|
|
38
|
+
logging.CRITICAL: "\033[35m", # magenta
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class _ColorFormatter(logging.Formatter):
    """Formatter that wraps the level name in ANSI color codes.

    Each record is rendered as ``[LEVEL] logger-name: message``. The level
    name is preceded by the color looked up in ``_COLORS`` for the record's
    level (no color when the level is missing from that table) and ``_BOLD``,
    and followed by ``_RESET``. Traceback and stack text carried by the record
    are appended on their own lines.
    """

    _FMT = "[{color}{bold}{level}{reset}] {name}: {message}"

    def format(self, record: logging.LogRecord) -> str:  # noqa: A003
        """Render ``record`` as a colored log line.

        Args:
            record: The log record to format.

        Returns:
            The formatted line, followed by the traceback and/or stack text
            when the record carries them.
        """
        record.message = record.getMessage()
        # Render the traceback once and cache it on the record so other
        # formatters handling the same record can reuse it.
        if record.exc_info and not record.exc_text:
            record.exc_text = self.formatException(record.exc_info)

        chunks = [
            self._FMT.format(
                color=_COLORS.get(record.levelno, ""),
                bold=_BOLD,
                level=record.levelname,
                reset=_RESET,
                name=record.name,
                message=record.message,
            )
        ]
        if record.exc_text:
            chunks.append(record.exc_text)
        # Honor ``stack_info=True`` on logging calls, as the base
        # ``logging.Formatter.format`` does.
        if record.stack_info:
            chunks.append(self.formatStack(record.stack_info))
        return "\n".join(chunks)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# Single named logger for the entire ssc_codegen package.
|
|
70
|
+
# All child loggers (parser, main, …) are children of this one,
|
|
71
|
+
# so a single ``logging.getLogger("ssc_codegen").setLevel(DEBUG)``
|
|
72
|
+
# enables everything at once.
|
|
73
|
+
logger = logging.getLogger("ssc_codegen")
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def setup_debug_logging() -> None:
    """Switch the ``ssc_codegen`` logger to DEBUG and attach a stream handler.

    Called by the CLI when ``--verbose`` is passed. Safe to call repeatedly:
    the level is only changed when it is unset or above DEBUG, and no handler
    is added when the logger already has a ``logging.StreamHandler``.
    """
    package_log = logging.getLogger("ssc_codegen")
    current_level = package_log.level
    if current_level == logging.NOTSET or current_level > logging.DEBUG:
        package_log.setLevel(logging.DEBUG)

    # A stream handler is already attached, so there is nothing left to do.
    for attached in package_log.handlers:
        if isinstance(attached, logging.StreamHandler):
            return

    # Colorize when colorama was imported or stderr reports itself as a TTY.
    colorize = _COLORAMA_AVAILABLE or (
        hasattr(sys.stderr, "isatty") and sys.stderr.isatty()
    )
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)

    line_formatter: logging.Formatter
    if colorize:
        line_formatter = _ColorFormatter()
    else:
        line_formatter = logging.Formatter(
            "[%(levelname)s] %(name)s: %(message)s"
        )
    stream_handler.setFormatter(line_formatter)
    package_log.addHandler(stream_handler)
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""
|
|
2
|
+
AST nodes for the KDL Schema DSL.
|
|
3
|
+
|
|
4
|
+
Import everything from here:
|
|
5
|
+
    from ssc_codegen.ast import Module, Field, CssSelect, ...
|
|
6
|
+
"""
|
|
7
|
+
from .types import VariableType, StructType
|
|
8
|
+
|
|
9
|
+
from .base import Node
|
|
10
|
+
|
|
11
|
+
from .module import (
|
|
12
|
+
Module,
|
|
13
|
+
CodeStartHook,
|
|
14
|
+
CodeEndHook,
|
|
15
|
+
Docstring,
|
|
16
|
+
Imports,
|
|
17
|
+
Utilities,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
from .typedef import TypeDef, TypeDefField
|
|
21
|
+
|
|
22
|
+
from .jsondef import JsonDef, JsonDefField
|
|
23
|
+
|
|
24
|
+
from .struct import (
|
|
25
|
+
Struct,
|
|
26
|
+
StructDocstring,
|
|
27
|
+
PreValidate,
|
|
28
|
+
Init,
|
|
29
|
+
InitField,
|
|
30
|
+
SplitDoc,
|
|
31
|
+
Key,
|
|
32
|
+
Value,
|
|
33
|
+
TableConfig,
|
|
34
|
+
TableRow,
|
|
35
|
+
TableMatchKey,
|
|
36
|
+
Field,
|
|
37
|
+
StartParse
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
from .selectors import (
|
|
41
|
+
CssSelect,
|
|
42
|
+
CssSelectAll,
|
|
43
|
+
XpathSelect,
|
|
44
|
+
XpathSelectAll,
|
|
45
|
+
CssRemove,
|
|
46
|
+
XpathRemove,
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
from .extract import Text, Raw, Attr
|
|
50
|
+
|
|
51
|
+
from .string import (
|
|
52
|
+
Trim,
|
|
53
|
+
Ltrim,
|
|
54
|
+
Rtrim,
|
|
55
|
+
NormalizeSpace,
|
|
56
|
+
RmPrefix,
|
|
57
|
+
RmSuffix,
|
|
58
|
+
RmPrefixSuffix,
|
|
59
|
+
Fmt,
|
|
60
|
+
Repl,
|
|
61
|
+
ReplMap,
|
|
62
|
+
Lower,
|
|
63
|
+
Upper,
|
|
64
|
+
Split,
|
|
65
|
+
Join,
|
|
66
|
+
Unescape,
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
from .regex import Re, ReAll, ReSub
|
|
70
|
+
|
|
71
|
+
from .array import Index, Slice, Len, Unique
|
|
72
|
+
|
|
73
|
+
from .cast import ToInt, ToFloat, ToBool, Jsonify, Nested
|
|
74
|
+
|
|
75
|
+
from .control import Self, Fallback, FallbackStart, FallbackEnd, Return
|
|
76
|
+
|
|
77
|
+
from .predicate_containers import Filter, Assert, Match
|
|
78
|
+
|
|
79
|
+
from .predicate_ops import (
|
|
80
|
+
PredEq,
|
|
81
|
+
PredNe,
|
|
82
|
+
PredGt,
|
|
83
|
+
PredLt,
|
|
84
|
+
PredGe,
|
|
85
|
+
PredLe,
|
|
86
|
+
PredRange,
|
|
87
|
+
PredStarts,
|
|
88
|
+
PredEnds,
|
|
89
|
+
PredContains,
|
|
90
|
+
PredIn,
|
|
91
|
+
PredRe,
|
|
92
|
+
PredReAny,
|
|
93
|
+
PredReAll,
|
|
94
|
+
PredCss,
|
|
95
|
+
PredXpath,
|
|
96
|
+
PredHasAttr,
|
|
97
|
+
PredCountEq,
|
|
98
|
+
PredCountGt,
|
|
99
|
+
PredCountLt,
|
|
100
|
+
PredCountNe,
|
|
101
|
+
PredCountGe,
|
|
102
|
+
PredCountLe,
|
|
103
|
+
PredCountRange,
|
|
104
|
+
PredAttrEnds,
|
|
105
|
+
PredAttrEq,
|
|
106
|
+
PredAttrNe,
|
|
107
|
+
PredAttrRe,
|
|
108
|
+
PredAttrStarts,
|
|
109
|
+
PredAttrContains,
|
|
110
|
+
PredTextContains,
|
|
111
|
+
PredTextEnds,
|
|
112
|
+
PredTextRe,
|
|
113
|
+
PredTextStarts,
|
|
114
|
+
LogicNot,
|
|
115
|
+
LogicAnd,
|
|
116
|
+
LogicOr,
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
from .transform import TransformDef, TransformTarget, TransformCall
|
|
120
|
+
|
|
121
|
+
__all__ = [
|
|
122
|
+
# types
|
|
123
|
+
"VariableType", "StructType",
|
|
124
|
+
# base
|
|
125
|
+
"Node",
|
|
126
|
+
# module
|
|
127
|
+
"Module", "CodeStartHook", "CodeEndHook",
|
|
128
|
+
"Docstring", "Imports", "Utilities",
|
|
129
|
+
# typedef
|
|
130
|
+
"TypeDef", "TypeDefField",
|
|
131
|
+
# jsondef
|
|
132
|
+
"JsonDef", "JsonDefField",
|
|
133
|
+
# struct
|
|
134
|
+
"Struct", "StructDocstring", "PreValidate",
|
|
135
|
+
"Init", "InitField", "SplitDoc",
|
|
136
|
+
"Key", "Value",
|
|
137
|
+
"TableConfig", "TableRow", "TableMatchKey",
|
|
138
|
+
"Field", "StartParse",
|
|
139
|
+
# selectors
|
|
140
|
+
"CssSelect", "CssSelectAll",
|
|
141
|
+
"XpathSelect", "XpathSelectAll",
|
|
142
|
+
"CssRemove", "XpathRemove",
|
|
143
|
+
# extract
|
|
144
|
+
"Text", "Raw", "Attr",
|
|
145
|
+
# string
|
|
146
|
+
"Trim", "Ltrim", "Rtrim", "NormalizeSpace",
|
|
147
|
+
"RmPrefix", "RmSuffix", "RmPrefixSuffix",
|
|
148
|
+
"Fmt", "Repl", "ReplMap",
|
|
149
|
+
"Lower", "Upper", "Split", "Join", "Unescape",
|
|
150
|
+
# regex
|
|
151
|
+
"Re", "ReAll", "ReSub",
|
|
152
|
+
# array
|
|
153
|
+
"Index", "Slice", "Len", "Unique",
|
|
154
|
+
# cast
|
|
155
|
+
"ToInt", "ToFloat", "ToBool", "Jsonify", "Nested",
|
|
156
|
+
# control
|
|
157
|
+
"Self", "Fallback", "FallbackStart", "FallbackEnd", "Return",
|
|
158
|
+
# predicate containers
|
|
159
|
+
"Filter", "Assert", "Match",
|
|
160
|
+
# predicate ops
|
|
161
|
+
"PredEq", "PredNe",
|
|
162
|
+
"PredGt", "PredLt", "PredGe", "PredLe", "PredRange",
|
|
163
|
+
"PredStarts", "PredEnds", "PredContains", "PredIn",
|
|
164
|
+
"PredRe", "PredReAny", "PredReAll",
|
|
165
|
+
"PredCss", "PredXpath", "PredHasAttr",
|
|
166
|
+
"PredAttrEq", "PredAttrNe",
|
|
167
|
+
"PredAttrStarts", "PredAttrEnds", "PredAttrContains", "PredAttrRe",
|
|
168
|
+
"PredTextStarts", "PredTextEnds", "PredTextContains", "PredTextRe",
|
|
169
|
+
"PredCountEq", "PredCountGt", "PredCountLt",
|
|
170
|
+
"PredCountNe", "PredCountGe", "PredCountLe", "PredCountRange",
|
|
171
|
+
"LogicNot", "LogicAnd", "LogicOr",
|
|
172
|
+
# transform
|
|
173
|
+
"TransformDef", "TransformTarget", "TransformCall",
|
|
174
|
+
]
|