chadselect 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chadselect-0.2.0/.gitignore +28 -0
- chadselect-0.2.0/PKG-INFO +113 -0
- chadselect-0.2.0/README.md +80 -0
- chadselect-0.2.0/pyproject.toml +54 -0
- chadselect-0.2.0/src/chadselect/__init__.py +40 -0
- chadselect-0.2.0/src/chadselect/_chadselect.py +218 -0
- chadselect-0.2.0/src/chadselect/_functions.py +134 -0
- chadselect-0.2.0/src/chadselect/_query.py +69 -0
- chadselect-0.2.0/src/chadselect/engine/__init__.py +1 -0
- chadselect-0.2.0/src/chadselect/engine/css.py +135 -0
- chadselect-0.2.0/src/chadselect/engine/json.py +66 -0
- chadselect-0.2.0/src/chadselect/engine/regex.py +49 -0
- chadselect-0.2.0/src/chadselect/engine/xpath.py +48 -0
- chadselect-0.2.0/src/chadselect/py.typed +23 -0
- chadselect-0.2.0/tests/bench.py +238 -0
- chadselect-0.2.0/tests/test_chadselect.py +776 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Rust
|
|
2
|
+
**/target/
|
|
3
|
+
**/*.rs.bk
|
|
4
|
+
Cargo.lock
|
|
5
|
+
|
|
6
|
+
# Python
|
|
7
|
+
__pycache__/
|
|
8
|
+
*.py[cod]
|
|
9
|
+
*$py.class
|
|
10
|
+
*.egg-info/
|
|
11
|
+
dist/
|
|
12
|
+
build/
|
|
13
|
+
*.egg
|
|
14
|
+
.venv/
|
|
15
|
+
|
|
16
|
+
# IDE / Editor
|
|
17
|
+
.idea/
|
|
18
|
+
.vscode/
|
|
19
|
+
*.swp
|
|
20
|
+
*.swo
|
|
21
|
+
*~
|
|
22
|
+
.DS_Store
|
|
23
|
+
Thumbs.db
|
|
24
|
+
|
|
25
|
+
# Environment
|
|
26
|
+
.env
|
|
27
|
+
.env.*
|
|
28
|
+
.venv/*
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chadselect
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Unified data extraction — CSS, XPath, Regex, and JMESPath behind one query interface.
|
|
5
|
+
Project-URL: Homepage, https://github.com/markjacksoncerberus/chadselect
|
|
6
|
+
Project-URL: Repository, https://github.com/markjacksoncerberus/chadselect
|
|
7
|
+
Project-URL: Issues, https://github.com/markjacksoncerberus/chadselect/issues
|
|
8
|
+
Author: Mark Jackson
|
|
9
|
+
License: MIT
|
|
10
|
+
Keywords: css,extraction,jmespath,parsing,regex,scraping,xpath
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.15
|
|
22
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
23
|
+
Classifier: Topic :: Text Processing :: Markup :: XML
|
|
24
|
+
Classifier: Typing :: Typed
|
|
25
|
+
Requires-Python: >=3.9
|
|
26
|
+
Requires-Dist: jmespath>=1.0
|
|
27
|
+
Requires-Dist: lxml>=5.0
|
|
28
|
+
Requires-Dist: selectolax>=0.3.21
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest-asyncio>=0.21; extra == 'dev'
|
|
31
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# ChadSelect
|
|
35
|
+
|
|
36
|
+
Unified data extraction — CSS, XPath, Regex, and JMESPath behind one query interface.
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
from chadselect import ChadSelect
|
|
40
|
+
|
|
41
|
+
cs = ChadSelect()
|
|
42
|
+
cs.add_html(html)
|
|
43
|
+
cs.add_json(json_str)
|
|
44
|
+
|
|
45
|
+
# One syntax, four engines
|
|
46
|
+
title = cs.select(0, "css:h1.title")
|
|
47
|
+
author = cs.select(0, "xpath://span[@class='author']/text()")
|
|
48
|
+
vin = cs.select(0, r"regex:[A-HJ-NPR-Z0-9]{17}")
|
|
49
|
+
name = cs.select(0, "json:data.products[0].name")
|
|
50
|
+
|
|
51
|
+
# Function piping
|
|
52
|
+
clean = cs.select(0, "css:.price >> trim >> uppercase()")
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Install
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install chadselect
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Query Syntax
|
|
62
|
+
|
|
63
|
+
Queries use an `engine:expression` prefix:
|
|
64
|
+
|
|
65
|
+
| Prefix | Engine | Best For |
|
|
66
|
+
|--------|--------|----------|
|
|
67
|
+
| `css:` | CSS Selectors (selectolax) | HTML element selection |
|
|
68
|
+
| `xpath:` | XPath 1.0 (lxml) | Complex HTML/XML traversal |
|
|
69
|
+
| `regex:` | Regular Expressions (re) | Pattern matching on raw text |
|
|
70
|
+
| `json:` | JMESPath (jmespath) | JSON field extraction |
|
|
71
|
+
|
|
72
|
+
No prefix defaults to regex.
|
|
73
|
+
|
|
74
|
+
## Function Piping
|
|
75
|
+
|
|
76
|
+
Chain text transformations with `>>`:
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
cs.select(0, "css:.price >> trim >> substring-after('$') >> uppercase()")
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Available functions: `trim`, `uppercase()`, `lowercase()`, `normalize-space()`,
|
|
83
|
+
`substring-after('delim')`, `substring-before('delim')`, `substring(start, len)`,
|
|
84
|
+
`replace('old', 'new')`, `get-attr('name')`.
|
|
85
|
+
|
|
86
|
+
## API
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
cs = ChadSelect()
|
|
90
|
+
|
|
91
|
+
# Load content
|
|
92
|
+
cs.add_html(html_string)
|
|
93
|
+
cs.add_json(json_string)
|
|
94
|
+
cs.add_text(plain_text)
|
|
95
|
+
|
|
96
|
+
# Query (index: 0=first, -1=all)
|
|
97
|
+
results = cs.query(-1, "css:.price") # List[str] — all matches
|
|
98
|
+
value = cs.select(0, "css:.price") # str — first match or ""
|
|
99
|
+
|
|
100
|
+
# Multi-query
|
|
101
|
+
first_hit = cs.select_first([(0, "css:#id"), (0, "xpath://fallback")])
|
|
102
|
+
combined = cs.select_many([(-1, "css:.a"), (-1, "css:.b")])
|
|
103
|
+
|
|
104
|
+
# Batch (fastest for many fields)
|
|
105
|
+
results = cs.query_batch([(-1, "css:.title"), (-1, "json:data.name")])
|
|
106
|
+
|
|
107
|
+
# With validators
|
|
108
|
+
results = cs.select_where(0, "css:.vin", lambda v: len(v) == 17)
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## License
|
|
112
|
+
|
|
113
|
+
MIT
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# ChadSelect
|
|
2
|
+
|
|
3
|
+
Unified data extraction — CSS, XPath, Regex, and JMESPath behind one query interface.
|
|
4
|
+
|
|
5
|
+
```python
|
|
6
|
+
from chadselect import ChadSelect
|
|
7
|
+
|
|
8
|
+
cs = ChadSelect()
|
|
9
|
+
cs.add_html(html)
|
|
10
|
+
cs.add_json(json_str)
|
|
11
|
+
|
|
12
|
+
# One syntax, four engines
|
|
13
|
+
title = cs.select(0, "css:h1.title")
|
|
14
|
+
author = cs.select(0, "xpath://span[@class='author']/text()")
|
|
15
|
+
vin = cs.select(0, r"regex:[A-HJ-NPR-Z0-9]{17}")
|
|
16
|
+
name = cs.select(0, "json:data.products[0].name")
|
|
17
|
+
|
|
18
|
+
# Function piping
|
|
19
|
+
clean = cs.select(0, "css:.price >> trim >> uppercase()")
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Install
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install chadselect
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Query Syntax
|
|
29
|
+
|
|
30
|
+
Queries use an `engine:expression` prefix:
|
|
31
|
+
|
|
32
|
+
| Prefix | Engine | Best For |
|
|
33
|
+
|--------|--------|----------|
|
|
34
|
+
| `css:` | CSS Selectors (selectolax) | HTML element selection |
|
|
35
|
+
| `xpath:` | XPath 1.0 (lxml) | Complex HTML/XML traversal |
|
|
36
|
+
| `regex:` | Regular Expressions (re) | Pattern matching on raw text |
|
|
37
|
+
| `json:` | JMESPath (jmespath) | JSON field extraction |
|
|
38
|
+
|
|
39
|
+
No prefix defaults to regex.
|
|
40
|
+
|
|
41
|
+
## Function Piping
|
|
42
|
+
|
|
43
|
+
Chain text transformations with `>>`:
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
cs.select(0, "css:.price >> trim >> substring-after('$') >> uppercase()")
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Available functions: `trim`, `uppercase()`, `lowercase()`, `normalize-space()`,
|
|
50
|
+
`substring-after('delim')`, `substring-before('delim')`, `substring(start, len)`,
|
|
51
|
+
`replace('old', 'new')`, `get-attr('name')`.
|
|
52
|
+
|
|
53
|
+
## API
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
cs = ChadSelect()
|
|
57
|
+
|
|
58
|
+
# Load content
|
|
59
|
+
cs.add_html(html_string)
|
|
60
|
+
cs.add_json(json_string)
|
|
61
|
+
cs.add_text(plain_text)
|
|
62
|
+
|
|
63
|
+
# Query (index: 0=first, -1=all)
|
|
64
|
+
results = cs.query(-1, "css:.price") # List[str] — all matches
|
|
65
|
+
value = cs.select(0, "css:.price") # str — first match or ""
|
|
66
|
+
|
|
67
|
+
# Multi-query
|
|
68
|
+
first_hit = cs.select_first([(0, "css:#id"), (0, "xpath://fallback")])
|
|
69
|
+
combined = cs.select_many([(-1, "css:.a"), (-1, "css:.b")])
|
|
70
|
+
|
|
71
|
+
# Batch (fastest for many fields)
|
|
72
|
+
results = cs.query_batch([(-1, "css:.title"), (-1, "json:data.name")])
|
|
73
|
+
|
|
74
|
+
# With validators
|
|
75
|
+
results = cs.select_where(0, "css:.vin", lambda v: len(v) == 17)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## License
|
|
79
|
+
|
|
80
|
+
MIT
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "chadselect"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Unified data extraction — CSS, XPath, Regex, and JMESPath behind one query interface."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Mark Jackson"},
|
|
14
|
+
]
|
|
15
|
+
keywords = ["scraping", "css", "xpath", "regex", "jmespath", "extraction", "parsing"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.9",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Programming Language :: Python :: 3.12",
|
|
25
|
+
"Programming Language :: Python :: 3.13",
|
|
26
|
+
"Programming Language :: Python :: 3.14",
|
|
27
|
+
"Programming Language :: Python :: 3.15",
|
|
28
|
+
"Topic :: Text Processing :: Markup :: HTML",
|
|
29
|
+
"Topic :: Text Processing :: Markup :: XML",
|
|
30
|
+
"Typing :: Typed",
|
|
31
|
+
]
|
|
32
|
+
dependencies = [
|
|
33
|
+
"selectolax>=0.3.21",
|
|
34
|
+
"lxml>=5.0",
|
|
35
|
+
"jmespath>=1.0",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[project.urls]
|
|
39
|
+
Homepage = "https://github.com/markjacksoncerberus/chadselect"
|
|
40
|
+
Repository = "https://github.com/markjacksoncerberus/chadselect"
|
|
41
|
+
Issues = "https://github.com/markjacksoncerberus/chadselect/issues"
|
|
42
|
+
|
|
43
|
+
[project.optional-dependencies]
|
|
44
|
+
dev = [
|
|
45
|
+
"pytest>=7.0",
|
|
46
|
+
"pytest-asyncio>=0.21",
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
[tool.hatch.build.targets.wheel]
|
|
50
|
+
packages = ["src/chadselect"]
|
|
51
|
+
|
|
52
|
+
[tool.pytest.ini_options]
|
|
53
|
+
testpaths = ["tests"]
|
|
54
|
+
asyncio_mode = "auto"
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ChadSelect — Unified data extraction.
|
|
3
|
+
|
|
4
|
+
CSS Selectors, XPath 1.0, Regex, and JMESPath behind one query interface
|
|
5
|
+
with chainable post-processing functions.
|
|
6
|
+
|
|
7
|
+
Usage::
|
|
8
|
+
|
|
9
|
+
from chadselect import ChadSelect
|
|
10
|
+
|
|
11
|
+
cs = ChadSelect()
|
|
12
|
+
cs.add_html('<span class="price">$49.99</span>')
|
|
13
|
+
price = cs.select(0, "css:.price")
|
|
14
|
+
# "$49.99"
|
|
15
|
+
|
|
16
|
+
Query prefixes::
|
|
17
|
+
|
|
18
|
+
css: → CSS Selectors (selectolax/lexbor)
|
|
19
|
+
xpath: → XPath 1.0 (lxml/libxml2)
|
|
20
|
+
json: → JMESPath
|
|
21
|
+
regex: → Regex (re stdlib)
|
|
22
|
+
(no prefix) → Regex (default)
|
|
23
|
+
|
|
24
|
+
Post-processing functions (pipe with >>)::
|
|
25
|
+
|
|
26
|
+
cs.select(0, "css:.price >> normalize-space() >> uppercase()")
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
from chadselect._chadselect import ChadSelect
|
|
30
|
+
from chadselect._query import FUNCTION_PIPE, QueryType, parse_query
|
|
31
|
+
from chadselect._functions import supported_text_functions
|
|
32
|
+
|
|
33
|
+
__all__ = [
|
|
34
|
+
"ChadSelect",
|
|
35
|
+
"FUNCTION_PIPE",
|
|
36
|
+
"QueryType",
|
|
37
|
+
"parse_query",
|
|
38
|
+
"supported_text_functions",
|
|
39
|
+
]
|
|
40
|
+
__version__ = "0.2.0"
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ChadSelect — the main extraction class.
|
|
3
|
+
|
|
4
|
+
API-compatible with the Rust ``chadselect`` crate.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
from typing import Any, Callable, List, Optional, Sequence, Tuple
|
|
11
|
+
|
|
12
|
+
from chadselect._query import ContentType, QueryType, parse_query, is_query_compatible
|
|
13
|
+
from chadselect.engine import css as css_engine
|
|
14
|
+
from chadselect.engine import xpath as xpath_engine
|
|
15
|
+
from chadselect.engine import regex as regex_engine
|
|
16
|
+
from chadselect.engine import json as json_engine
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _default_valid(s: str) -> bool:
|
|
22
|
+
"""Default validator — non-empty, non-whitespace."""
|
|
23
|
+
return bool(s and s.strip())
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class _ContentItem:
    """Pair a raw content string with the content type it was loaded as."""

    # Fixed attribute set, so instances carry no per-instance ``__dict__``.
    __slots__ = ("content", "content_type")

    def __init__(self, content: str, content_type: ContentType) -> None:
        # Tag consulted by ChadSelect.query() to skip incompatible engines.
        self.content_type = content_type
        # The text exactly as supplied by the caller.
        self.content = content
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ChadSelect:
    """One query interface over CSS, XPath, Regex, and JMESPath extraction.

    Load content first, then query it with a prefixed query string::

        cs = ChadSelect()
        cs.add_html('<span class="price">$49.99</span>')
        price = cs.select(0, "css:.price")  # "$49.99"

    Query prefixes:
        - ``css:``   → CSS Selectors (selectolax / lexbor)
        - ``xpath:`` → XPath 1.0 (lxml / libxml2)
        - ``json:``  → JMESPath
        - ``regex:`` → Python ``re``
        - *(none)*   → Regex (default)

    Results can be post-processed by piping functions with ``>>``::

        cs.select(0, "css:.price >> normalize-space() >> uppercase()")
    """

    __slots__ = ("_content_list",)

    def __init__(self) -> None:
        self._content_list: List[_ContentItem] = []

    # ── Content management ──────────────────────────────────────────────

    def add_text(self, content: str) -> None:
        """Load *content* tagged as plain text."""
        item = _ContentItem(content, ContentType.TEXT)
        self._content_list.append(item)

    def add_html(self, content: str) -> None:
        """Load *content* tagged as HTML (compatible with CSS, XPath, and Regex)."""
        item = _ContentItem(content, ContentType.HTML)
        self._content_list.append(item)

    def add_json(self, content: str) -> None:
        """Load *content* tagged as JSON (compatible with JMESPath and Regex)."""
        item = _ContentItem(content, ContentType.JSON)
        self._content_list.append(item)

    def content_count(self) -> int:
        """Return how many content items are currently loaded."""
        return len(self._content_list)

    def clear(self) -> None:
        """Drop every loaded content item."""
        self._content_list.clear()

    # ── Querying ────────────────────────────────────────────────────────

    @staticmethod
    def _run_engine(query_type: "QueryType", expression: str, content: str) -> List[str]:
        """Run *expression* against *content* with the engine for *query_type*."""
        if query_type == QueryType.CSS:
            return css_engine.process(expression, content)
        if query_type == QueryType.XPATH:
            return xpath_engine.process(expression, content)
        if query_type == QueryType.REGEX:
            return regex_engine.process(expression, content)
        if query_type == QueryType.JSON:
            return json_engine.process(expression, content)
        # Unrecognised query type: contributes no results.
        return []

    def query(self, index: int, query_str: str) -> List[str]:
        """Run one query over every compatible content item.

        Matches from all compatible items are concatenated in load order
        before *index* is applied.

        Args:
            index: ``-1`` returns **all** matches; ``>= 0`` returns only the
                match at that position.
            query_str: Prefixed query string (e.g. ``"css:.price"``).

        Returns:
            List of matched strings; ``[]`` when *index* is out of bounds
            or is a negative value other than ``-1``.
        """
        query_type, expression = parse_query(query_str)

        collected: List[str] = []
        for item in self._content_list:
            if is_query_compatible(query_type, item.content_type):
                collected.extend(
                    self._run_engine(query_type, expression, item.content)
                )

        return _select_by_index(collected, index)

    def select(self, index: int, query_str: str) -> str:
        """Return the match selected by *index* as a string, or ``""``.

        With ``index == -1`` the first of all matches is used. The match is
        only returned when it is non-empty and non-whitespace.
        """
        return self.select_where(index, query_str, _default_valid)

    def select_where(
        self,
        index: int,
        query_str: str,
        valid: Callable[[str], bool],
    ) -> str:
        """Like :meth:`select`, but *valid* decides whether to accept the match.

        Args:
            index: Passed through to :meth:`query`.
            query_str: Passed through to :meth:`query`.
            valid: Receives the candidate string, returns ``True`` to accept.

        Returns:
            The first string of the query result when *valid* accepts it,
            otherwise ``""``.
        """
        matches = self.query(index, query_str)
        if not matches:
            return ""
        candidate = matches[0]
        return candidate if valid(candidate) else ""

    def select_first(
        self, queries: Sequence[Tuple[int, str]]
    ) -> List[str]:
        """Try *queries* in order and return the first valid result set.

        A result set is valid when it is non-empty and every element is
        non-empty and non-whitespace.
        """
        return self.select_first_where(queries, _default_valid)

    def select_first_where(
        self,
        queries: Sequence[Tuple[int, str]],
        valid: Callable[[str], bool],
    ) -> List[str]:
        """Like :meth:`select_first`, but *valid* judges each element."""
        for index, query_str in queries:
            matches = self.query(index, query_str)
            if not matches:
                continue
            if all(map(valid, matches)):
                return matches
        return []

    def select_many(
        self, queries: Sequence[Tuple[int, str]]
    ) -> List[str]:
        """Run every query and return the combined results without duplicates."""
        return self.select_many_where(queries, _default_valid)

    def select_many_where(
        self,
        queries: Sequence[Tuple[int, str]],
        valid: Callable[[str], bool],
    ) -> List[str]:
        """Like :meth:`select_many`, but *valid* filters each element.

        Results keep the order in which they were first seen.
        """
        # A dict keeps insertion order, so it serves as an ordered set.
        unique: dict[str, None] = {}
        for index, query_str in queries:
            for match in self.query(index, query_str):
                if valid(match) and match not in unique:
                    unique[match] = None
        return list(unique)

    def query_batch(
        self, queries: Sequence[Tuple[int, str]]
    ) -> List[List[str]]:
        """Run several queries in a single call.

        Returns one result list per input query, in the same order.
        """
        batches: List[List[str]] = []
        for index, query_str in queries:
            batches.append(self.query(index, query_str))
        return batches

    # ── Dunder ──────────────────────────────────────────────────────────

    def __repr__(self) -> str:
        return f"ChadSelect(content_count={self.content_count()})"

    def __len__(self) -> int:
        return self.content_count()
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _select_by_index(results: List[str], index: int) -> List[str]:
|
|
207
|
+
"""Select results by index — ``-1`` means 'all'."""
|
|
208
|
+
if index == -1:
|
|
209
|
+
return results
|
|
210
|
+
if index >= 0:
|
|
211
|
+
if index < len(results):
|
|
212
|
+
return [results[index]]
|
|
213
|
+
logger.warning(
|
|
214
|
+
"Index %d out of range (have %d results)", index, len(results)
|
|
215
|
+
)
|
|
216
|
+
return []
|
|
217
|
+
logger.warning("Invalid index: %d", index)
|
|
218
|
+
return []
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Post-processing text functions — shared by all engines.
|
|
3
|
+
|
|
4
|
+
Functions are chained using the ``>>`` delimiter after a selector expression::
|
|
5
|
+
|
|
6
|
+
css:.price >> normalize-space() >> uppercase()
|
|
7
|
+
xpath://div/text() >> substring-after('VIN: ') >> substring(0, 3)
|
|
8
|
+
|
|
9
|
+
Mirrors the Rust crate's ``functions.rs`` exactly.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
from typing import List, Tuple
|
|
16
|
+
|
|
17
|
+
from chadselect._query import FUNCTION_PIPE
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def supported_text_functions() -> List[str]:
    """List the signatures of every supported post-processing function.

    A fresh list is built on each call, so callers may mutate the result.
    """
    signatures = (
        "normalize-space()",
        "trim()",
        "uppercase()",
        "lowercase()",
        "substring(start, length)",
        "substring-after('delimiter')",
        "substring-before('delimiter')",
        "replace('find', 'replace')",
        "get-attr('attribute')",
    )
    return list(signatures)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def split_functions(input_str: str) -> Tuple[str, str]:
    """Separate a selector expression from its trailing function chain.

    ``expression >> func1() >> func2()`` becomes
    ``("expression", " func1() >> func2()")``: the expression is stripped,
    the chain is returned verbatim. The split happens at the first pipe.

    Returns ``(expression, "")`` when no pipe is present.
    """
    expression, pipe, chain = input_str.partition(FUNCTION_PIPE)
    if not pipe:
        return input_str.strip(), ""
    return expression.strip(), chain
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def parse_and_apply(results: List[str], func_chain_str: str) -> List[str]:
    """Run every function named in *func_chain_str* over *results*, in order.

    Blank segments between pipes are ignored. After each function, empty
    strings are dropped from the intermediate results (matches Rust
    behavior). A blank chain returns *results* untouched.
    """
    if not func_chain_str.strip():
        return results

    steps = (segment.strip() for segment in func_chain_str.split(FUNCTION_PIPE))
    for step in steps:
        if step:
            results = [value for value in _apply_one(results, step) if value]
    return results
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _apply_one(results: List[str], func_str: str) -> List[str]:
|
|
62
|
+
"""Apply a single function to all results."""
|
|
63
|
+
paren = func_str.find("(")
|
|
64
|
+
if paren == -1:
|
|
65
|
+
# Shorthand without parens — e.g. "trim"
|
|
66
|
+
name = func_str.strip()
|
|
67
|
+
args_str = ""
|
|
68
|
+
else:
|
|
69
|
+
name = func_str[:paren].strip()
|
|
70
|
+
end = func_str.rfind(")")
|
|
71
|
+
args_str = func_str[paren + 1: end if end != -1 else len(func_str)]
|
|
72
|
+
|
|
73
|
+
if name == "normalize-space":
|
|
74
|
+
return [re.sub(r"\s+", " ", s).strip() for s in results]
|
|
75
|
+
|
|
76
|
+
if name == "trim":
|
|
77
|
+
return [s.strip() for s in results]
|
|
78
|
+
|
|
79
|
+
if name == "uppercase":
|
|
80
|
+
return [s.upper() for s in results]
|
|
81
|
+
|
|
82
|
+
if name == "lowercase":
|
|
83
|
+
return [s.lower() for s in results]
|
|
84
|
+
|
|
85
|
+
if name == "substring":
|
|
86
|
+
args = [a.strip() for a in args_str.split(",")]
|
|
87
|
+
if len(args) >= 2:
|
|
88
|
+
try:
|
|
89
|
+
start, length = int(args[0]), int(args[1])
|
|
90
|
+
return [s[start: start + length] for s in results]
|
|
91
|
+
except ValueError:
|
|
92
|
+
return results
|
|
93
|
+
return results
|
|
94
|
+
|
|
95
|
+
if name == "substring-after":
|
|
96
|
+
delim = args_str.strip().strip("\"'")
|
|
97
|
+
out = []
|
|
98
|
+
for s in results:
|
|
99
|
+
idx = s.find(delim)
|
|
100
|
+
out.append(s[idx + len(delim):] if idx != -1 else "")
|
|
101
|
+
# Filter out empty results (matches Rust behavior)
|
|
102
|
+
return [r for r in out if r]
|
|
103
|
+
|
|
104
|
+
if name == "substring-before":
|
|
105
|
+
delim = args_str.strip().strip("\"'")
|
|
106
|
+
out = []
|
|
107
|
+
for s in results:
|
|
108
|
+
idx = s.find(delim)
|
|
109
|
+
out.append(s[:idx] if idx != -1 else s)
|
|
110
|
+
return out
|
|
111
|
+
|
|
112
|
+
if name == "replace":
|
|
113
|
+
args = _parse_two_string_args(args_str)
|
|
114
|
+
if args:
|
|
115
|
+
find, repl = args
|
|
116
|
+
return [s.replace(find, repl) for s in results]
|
|
117
|
+
return results
|
|
118
|
+
|
|
119
|
+
if name == "get-attr":
|
|
120
|
+
# Handled specially by the CSS engine — pass through here
|
|
121
|
+
# (the attr name is extracted at the engine level)
|
|
122
|
+
return results
|
|
123
|
+
|
|
124
|
+
# Unknown function — skip silently
|
|
125
|
+
return results
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _parse_two_string_args(args_str: str) -> Tuple[str, str] | None:
|
|
129
|
+
"""Parse ``'find', 'replace'`` from an argument string."""
|
|
130
|
+
# Match 'x', 'y' or "x", "y"
|
|
131
|
+
m = re.match(r"""['"](.*?)['"],\s*['"](.*?)['"]""", args_str.strip())
|
|
132
|
+
if m:
|
|
133
|
+
return m.group(1), m.group(2)
|
|
134
|
+
return None
|