chadselect 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chadselect/__init__.py ADDED
@@ -0,0 +1,40 @@
1
+ """
2
+ ChadSelect — Unified data extraction.
3
+
4
+ CSS Selectors, XPath 1.0, Regex, and JMESPath behind one query interface
5
+ with chainable post-processing functions.
6
+
7
+ Usage::
8
+
9
+ from chadselect import ChadSelect
10
+
11
+ cs = ChadSelect()
12
+ cs.add_html('<span class="price">$49.99</span>')
13
+ price = cs.select(0, "css:.price")
14
+ # "$49.99"
15
+
16
+ Query prefixes::
17
+
18
+ css: → CSS Selectors (selectolax/lexbor)
19
+ xpath: → XPath 1.0 (lxml/libxml2)
20
+ json: → JMESPath
21
+ regex: → Regex (re stdlib)
22
+ (no prefix) → Regex (default)
23
+
24
+ Post-processing functions (pipe with >>)::
25
+
26
+ cs.select(0, "css:.price >> normalize-space() >> uppercase()")
27
+ """
28
+
29
+ from chadselect._chadselect import ChadSelect
30
+ from chadselect._query import FUNCTION_PIPE, QueryType, parse_query
31
+ from chadselect._functions import supported_text_functions
32
+
33
+ __all__ = [
34
+ "ChadSelect",
35
+ "FUNCTION_PIPE",
36
+ "QueryType",
37
+ "parse_query",
38
+ "supported_text_functions",
39
+ ]
40
+ __version__ = "0.2.0"
@@ -0,0 +1,218 @@
1
+ """
2
+ ChadSelect — the main extraction class.
3
+
4
+ API-compatible with the Rust ``chadselect`` crate.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ from typing import Any, Callable, List, Optional, Sequence, Tuple
11
+
12
+ from chadselect._query import ContentType, QueryType, parse_query, is_query_compatible
13
+ from chadselect.engine import css as css_engine
14
+ from chadselect.engine import xpath as xpath_engine
15
+ from chadselect.engine import regex as regex_engine
16
+ from chadselect.engine import json as json_engine
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ def _default_valid(s: str) -> bool:
22
+ """Default validator — non-empty, non-whitespace."""
23
+ return bool(s and s.strip())
24
+
25
+
26
+ class _ContentItem:
27
+ """Internal content item with type tag."""
28
+
29
+ __slots__ = ("content", "content_type")
30
+
31
+ def __init__(self, content: str, content_type: ContentType) -> None:
32
+ self.content = content
33
+ self.content_type = content_type
34
+
35
+
36
class ChadSelect:
    """Unified data extraction — CSS, XPath, Regex, and JMESPath.

    Load content, then query it with a prefixed query string::

        cs = ChadSelect()
        cs.add_html('<span class="price">$49.99</span>')
        price = cs.select(0, "css:.price")  # "$49.99"

    Query prefixes:
        - ``css:``   → CSS Selectors (selectolax / lexbor)
        - ``xpath:`` → XPath 1.0 (lxml / libxml2)
        - ``json:``  → JMESPath
        - ``regex:`` → Python ``re``
        - *(none)*   → Regex (default)

    Post-processing via ``>>``::

        cs.select(0, "css:.price >> normalize-space() >> uppercase()")
    """

    __slots__ = ("_content_list",)

    def __init__(self) -> None:
        self._content_list: List[_ContentItem] = []

    # ── Content management ──────────────────────────────────────────────

    def add_text(self, content: str) -> None:
        """Append *content* tagged as plain text."""
        item = _ContentItem(content, ContentType.TEXT)
        self._content_list.append(item)

    def add_html(self, content: str) -> None:
        """Append *content* tagged as HTML."""
        item = _ContentItem(content, ContentType.HTML)
        self._content_list.append(item)

    def add_json(self, content: str) -> None:
        """Append *content* tagged as JSON."""
        item = _ContentItem(content, ContentType.JSON)
        self._content_list.append(item)

    def content_count(self) -> int:
        """Return how many content items are currently loaded."""
        return len(self._content_list)

    def clear(self) -> None:
        """Drop every loaded content item."""
        self._content_list.clear()

    # ── Querying ────────────────────────────────────────────────────────

    def query(self, index: int, query_str: str) -> List[str]:
        """Run *query_str* over every compatible content item.

        Args:
            index: ``-1`` keeps all matches; ``>= 0`` keeps only the match
                at that position (an empty list when out of range).
            query_str: Prefixed query string (e.g. ``"css:.price"``).

        Returns:
            The matched strings, gathered across content items in load
            order and then narrowed by *index*.
        """
        query_type, expression = parse_query(query_str)

        # One engine entry point per query type; unknown types yield nothing.
        engines = {
            QueryType.CSS: css_engine.process,
            QueryType.XPATH: xpath_engine.process,
            QueryType.REGEX: regex_engine.process,
            QueryType.JSON: json_engine.process,
        }
        run = engines.get(query_type)

        collected: List[str] = []
        for item in self._content_list:
            if not is_query_compatible(query_type, item.content_type):
                continue
            if run is not None:
                collected.extend(run(expression, item.content))

        return _select_by_index(collected, index)

    def select(self, index: int, query_str: str) -> str:
        """Return the result selected by *index* as a string, or ``""``.

        The result is accepted only when it is non-empty and
        non-whitespace.
        """
        return self.select_where(index, query_str, _default_valid)

    def select_where(
        self,
        index: int,
        query_str: str,
        valid: Callable[[str], bool],
    ) -> str:
        """Like :meth:`select` but with a caller-supplied validity check.

        Args:
            valid: Called with the first string of the query result;
                return ``True`` to accept it.
        """
        matches = self.query(index, query_str)
        if not matches:
            return ""
        candidate = matches[0]
        return candidate if valid(candidate) else ""

    def select_first(
        self, queries: Sequence[Tuple[int, str]]
    ) -> List[str]:
        """Try each query in order and return the first valid result set.

        A result set is valid when it is non-empty and every element is
        non-empty and non-whitespace.
        """
        return self.select_first_where(queries, _default_valid)

    def select_first_where(
        self,
        queries: Sequence[Tuple[int, str]],
        valid: Callable[[str], bool],
    ) -> List[str]:
        """Like :meth:`select_first` but with a caller-supplied check."""
        for index, query_str in queries:
            hits = self.query(index, query_str)
            if hits and all(map(valid, hits)):
                return hits
        return []

    def select_many(
        self, queries: Sequence[Tuple[int, str]]
    ) -> List[str]:
        """Run every query and return the combined, de-duplicated results."""
        return self.select_many_where(queries, _default_valid)

    def select_many_where(
        self,
        queries: Sequence[Tuple[int, str]],
        valid: Callable[[str], bool],
    ) -> List[str]:
        """Like :meth:`select_many` but with a caller-supplied check."""
        already: set[str] = set()
        ordered: List[str] = []
        for index, query_str in queries:
            for value in self.query(index, query_str):
                # The validator runs first, exactly once per candidate.
                if not valid(value) or value in already:
                    continue
                already.add(value)
                ordered.append(value)
        return ordered

    def query_batch(
        self, queries: Sequence[Tuple[int, str]]
    ) -> List[List[str]]:
        """Run several queries in one call.

        Returns one result list per input query, in the same order.
        """
        batches: List[List[str]] = []
        for index, q in queries:
            batches.append(self.query(index, q))
        return batches

    # ── Dunder ──────────────────────────────────────────────────────────

    def __repr__(self) -> str:
        count = self.content_count()
        return f"ChadSelect(content_count={count})"

    def __len__(self) -> int:
        return self.content_count()
204
+
205
+
206
+ def _select_by_index(results: List[str], index: int) -> List[str]:
207
+ """Select results by index — ``-1`` means 'all'."""
208
+ if index == -1:
209
+ return results
210
+ if index >= 0:
211
+ if index < len(results):
212
+ return [results[index]]
213
+ logger.warning(
214
+ "Index %d out of range (have %d results)", index, len(results)
215
+ )
216
+ return []
217
+ logger.warning("Invalid index: %d", index)
218
+ return []
@@ -0,0 +1,134 @@
1
+ """
2
+ Post-processing text functions — shared by all engines.
3
+
4
+ Functions are chained using the ``>>`` delimiter after a selector expression::
5
+
6
+ css:.price >> normalize-space() >> uppercase()
7
+ xpath://div/text() >> substring-after('VIN: ') >> substring(0, 3)
8
+
9
+ Mirrors the Rust crate's ``functions.rs`` exactly.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import re
15
+ from typing import List, Tuple
16
+
17
+ from chadselect._query import FUNCTION_PIPE
18
+
19
+
20
def supported_text_functions() -> List[str]:
    """Return the signatures of every supported text function.

    A new list is built on each call, so callers may mutate the result.
    """
    signatures = (
        "normalize-space()",
        "trim()",
        "uppercase()",
        "lowercase()",
        "substring(start, length)",
        "substring-after('delimiter')",
        "substring-before('delimiter')",
        "replace('find', 'replace')",
        "get-attr('attribute')",
    )
    return list(signatures)
33
+
34
+
35
def split_functions(input_str: str) -> Tuple[str, str]:
    """Separate a selector expression from its ``>>`` function chain.

    Only the first pipe splits the input: the stripped text before it is
    the expression, and everything after it (untouched) is the chain.
    Returns ``(expression, "")`` when the input contains no pipe.
    """
    head, pipe, tail = input_str.partition(FUNCTION_PIPE)
    if not pipe:
        return head.strip(), ""
    return head.strip(), tail
44
+
45
+
46
def parse_and_apply(results: List[str], func_chain_str: str) -> List[str]:
    """Apply each function named in *func_chain_str* to *results* in order.

    A blank chain returns *results* unchanged. Blank segments between
    pipes are ignored, and empty strings are dropped after every step.
    """
    if not func_chain_str.strip():
        return results

    steps = (part.strip() for part in func_chain_str.split(FUNCTION_PIPE))
    for step in steps:
        if step:
            # Empty outputs are removed before the next function sees them.
            results = [r for r in _apply_one(results, step) if r]
    return results
59
+
60
+
61
+ def _apply_one(results: List[str], func_str: str) -> List[str]:
62
+ """Apply a single function to all results."""
63
+ paren = func_str.find("(")
64
+ if paren == -1:
65
+ # Shorthand without parens — e.g. "trim"
66
+ name = func_str.strip()
67
+ args_str = ""
68
+ else:
69
+ name = func_str[:paren].strip()
70
+ end = func_str.rfind(")")
71
+ args_str = func_str[paren + 1: end if end != -1 else len(func_str)]
72
+
73
+ if name == "normalize-space":
74
+ return [re.sub(r"\s+", " ", s).strip() for s in results]
75
+
76
+ if name == "trim":
77
+ return [s.strip() for s in results]
78
+
79
+ if name == "uppercase":
80
+ return [s.upper() for s in results]
81
+
82
+ if name == "lowercase":
83
+ return [s.lower() for s in results]
84
+
85
+ if name == "substring":
86
+ args = [a.strip() for a in args_str.split(",")]
87
+ if len(args) >= 2:
88
+ try:
89
+ start, length = int(args[0]), int(args[1])
90
+ return [s[start: start + length] for s in results]
91
+ except ValueError:
92
+ return results
93
+ return results
94
+
95
+ if name == "substring-after":
96
+ delim = args_str.strip().strip("\"'")
97
+ out = []
98
+ for s in results:
99
+ idx = s.find(delim)
100
+ out.append(s[idx + len(delim):] if idx != -1 else "")
101
+ # Filter out empty results (matches Rust behavior)
102
+ return [r for r in out if r]
103
+
104
+ if name == "substring-before":
105
+ delim = args_str.strip().strip("\"'")
106
+ out = []
107
+ for s in results:
108
+ idx = s.find(delim)
109
+ out.append(s[:idx] if idx != -1 else s)
110
+ return out
111
+
112
+ if name == "replace":
113
+ args = _parse_two_string_args(args_str)
114
+ if args:
115
+ find, repl = args
116
+ return [s.replace(find, repl) for s in results]
117
+ return results
118
+
119
+ if name == "get-attr":
120
+ # Handled specially by the CSS engine — pass through here
121
+ # (the attr name is extracted at the engine level)
122
+ return results
123
+
124
+ # Unknown function — skip silently
125
+ return results
126
+
127
+
128
+ def _parse_two_string_args(args_str: str) -> Tuple[str, str] | None:
129
+ """Parse ``'find', 'replace'`` from an argument string."""
130
+ # Match 'x', 'y' or "x", "y"
131
+ m = re.match(r"""['"](.*?)['"],\s*['"](.*?)['"]""", args_str.strip())
132
+ if m:
133
+ return m.group(1), m.group(2)
134
+ return None
chadselect/_query.py ADDED
@@ -0,0 +1,69 @@
1
+ """
2
+ Query type parsing — prefix-based routing to the correct extraction engine.
3
+
4
+ Mirrors the Rust crate's ``query.rs`` exactly.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from enum import Enum, auto
10
+ from typing import Tuple
11
+
12
+ #: The function-pipe delimiter used to separate a selector expression from its
13
+ #: post-processing function chain.
14
+ #:
15
+ #: We use ``>>`` instead of ``|`` because ``|`` is a union operator in XPath 1.0
16
+ #: and a pipe operator in JMESPath, which would create ambiguity.
17
+ FUNCTION_PIPE: str = ">>"
18
+
19
+
20
class QueryType(Enum):
    """Extraction engine selected by a query's prefix."""

    #: Regex pattern — works on all content types.
    REGEX = 1
    #: XPath 1.0 expression — works on HTML and Text.
    XPATH = 2
    #: JMESPath expression — works on JSON.
    JSON = 3
    #: CSS selector — works on HTML.
    CSS = 4
31
+
32
+
33
+ #: Content type identifiers.
34
class ContentType(Enum):
    """Tag recording how a loaded document should be interpreted."""

    TEXT = 1
    HTML = 2
    JSON = 3
38
+
39
+
40
+ #: Maps query types to compatible content types.
41
+ _COMPAT = {
42
+ QueryType.REGEX: {ContentType.TEXT, ContentType.HTML, ContentType.JSON},
43
+ QueryType.XPATH: {ContentType.TEXT, ContentType.HTML},
44
+ QueryType.JSON: {ContentType.JSON},
45
+ QueryType.CSS: {ContentType.HTML},
46
+ }
47
+
48
+
49
def parse_query(query: str) -> Tuple[QueryType, str]:
    """Split a prefixed query string into ``(QueryType, expression)``.

    Recognised prefixes are ``regex:``, ``json:``, ``xpath:`` and ``css:``.
    A query with none of them is treated as a regex and returned whole.
    """
    prefixes = (
        ("regex:", QueryType.REGEX),
        ("json:", QueryType.JSON),
        ("xpath:", QueryType.XPATH),
        ("css:", QueryType.CSS),
    )
    for prefix, kind in prefixes:
        if query.startswith(prefix):
            return kind, query[len(prefix):]
    # No recognised prefix: default to regex.
    return QueryType.REGEX, query
65
+
66
+
67
def is_query_compatible(query_type: QueryType, content_type: ContentType) -> bool:
    """Return ``True`` when *query_type* may run against *content_type*."""
    accepted = _COMPAT[query_type]
    return content_type in accepted
@@ -0,0 +1 @@
1
+ """Engine modules — one per query type."""
@@ -0,0 +1,135 @@
1
+ """CSS selector engine — powered by selectolax (lexbor).
2
+
3
+ Includes custom text pseudo-selectors that mirror the Rust crate:
4
+
5
+ - ``:has-text('...')`` — select elements whose subtree *contains* the text
6
+ - ``:contains-text('...')`` — select elements whose *own* text contains the substring
7
+ - ``:text-equals('...')`` — select elements whose text *equals* the argument
8
+ - ``:text-starts('...')`` — select elements whose text *starts with* the argument
9
+ - ``:text-ends('...')`` — select elements whose text *ends with* the argument
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ import re as _re
16
+ from typing import List, Optional, Tuple
17
+
18
+ from selectolax.parser import HTMLParser, Node
19
+
20
+ from chadselect._functions import split_functions, parse_and_apply
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # ── pseudo-selector regex ────────────────────────────────────────────────────
25
+ # Matches :pseudo-name('argument') (with optional trailing selectors)
26
+ _PSEUDO_RE = _re.compile(
27
+ r":(?P<pseudo>has-text|contains-text|text-equals|text-starts|text-ends)"
28
+ r"\(['\"](?P<arg>[^'\"]*)['\"]\)"
29
+ )
30
+
31
+
32
def _extract_pseudo(selector: str) -> Tuple[str, Optional[str], Optional[str], str]:
    """Break *selector* around its first custom text pseudo-selector.

    Returns ``(base, pseudo_name, pseudo_arg, trailing)``, where *base* is
    the text before the pseudo and *trailing* is the stripped text after
    it. With no custom pseudo the result is ``(selector, None, None, "")``.
    """
    found = _PSEUDO_RE.search(selector)
    if found is None:
        return selector, None, None, ""

    head = selector[: found.start()]
    tail = selector[found.end():].strip()  # e.g. " .value"
    return head, found.group("pseudo"), found.group("arg"), tail
43
+
44
+
45
+ def _node_text(node: Node) -> str:
46
+ """Full text content (subtree), stripped."""
47
+ return (node.text(strip=True) or "").strip()
48
+
49
+
50
+ def _matches_pseudo(node: Node, pseudo: str, arg: str) -> bool:
51
+ text = _node_text(node)
52
+ if pseudo == "has-text":
53
+ return arg in text
54
+ if pseudo == "contains-text":
55
+ return arg in text
56
+ if pseudo == "text-equals":
57
+ return text == arg
58
+ if pseudo == "text-starts":
59
+ return text.startswith(arg)
60
+ if pseudo == "text-ends":
61
+ return text.endswith(arg)
62
+ return False
63
+
64
+
65
def process(selector_with_functions: str, html: str) -> List[str]:
    """Run a CSS selector against HTML content, with optional ``>>`` functions.

    Args:
        selector_with_functions: CSS selector, optionally containing one
            custom text pseudo-selector and followed by a ``>>`` chain.
        html: HTML document to query.

    Returns:
        The text of each matched node, or the value of the attribute named
        by ``get-attr('...')`` when the chain contains one, after the rest
        of the chain has been applied. A selector the parser rejects logs
        a warning and returns ``[]``.
    """
    selector, func_chain = split_functions(selector_with_functions)

    # get-attr changes what is read from each node, so it is taken out of
    # the chain before the remaining text functions run.
    attr_name = _extract_get_attr(func_chain)
    if attr_name:
        func_chain = _remove_get_attr(func_chain)

    tree = HTMLParser(html)

    # ── custom pseudo-selector handling ──────────────────────────────────
    base, pseudo, pseudo_arg, trailing = _extract_pseudo(selector)
    if pseudo:
        try:
            if base:
                candidates = tree.css(base)
            else:
                # No base selector: test the body, if the parser built one.
                body = tree.body
                candidates = [body] if body is not None else []
        except Exception as e:
            logger.warning("CSS selector failed for '%s': %s", base, e)
            return []

        nodes: List[Node] = []
        for node in candidates:
            if not _matches_pseudo(node, pseudo, pseudo_arg or ""):
                continue
            if not trailing:
                nodes.append(node)
                continue
            # e.g. ":has-text('Exterior:') .value" — query inside the match.
            try:
                nodes.extend(node.css(trailing))
            except Exception as e:
                # Best effort: report the bad selector, keep other nodes.
                logger.warning(
                    "CSS selector failed for '%s': %s", trailing, e
                )
    else:
        try:
            nodes = tree.css(selector)
        except Exception as e:
            logger.warning("CSS selector failed for '%s': %s", selector, e)
            return []

    # ── extract text / attributes ────────────────────────────────────────
    results: List[str] = []
    for node in nodes:
        if attr_name:
            val = node.attributes.get(attr_name, "")
            if val:
                results.append(val)
        else:
            text = node.text(strip=True)
            if text:
                results.append(text)

    if func_chain.strip():
        results = parse_and_apply(results, func_chain)

    return results
120
+
121
+
122
+ def _extract_get_attr(func_chain: str) -> str | None:
123
+ """Extract the attribute name from a ``get-attr('name')`` call."""
124
+ import re
125
+ m = re.search(r"get-attr\(['\"](\w[\w-]*?)['\"]\)", func_chain)
126
+ return m.group(1) if m else None
127
+
128
+
129
+ def _remove_get_attr(func_chain: str) -> str:
130
+ """Remove ``get-attr(...)`` from a function chain string."""
131
+ import re
132
+ cleaned = re.sub(r"\s*get-attr\(['\"][\w-]+?['\"]\)\s*", " ", func_chain)
133
+ # Clean up stray >> delimiters
134
+ cleaned = re.sub(r"(>>\s*)+", ">> ", cleaned).strip().strip(">>").strip()
135
+ return cleaned
@@ -0,0 +1,66 @@
1
+ """JMESPath engine — powered by the ``jmespath`` library."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ from typing import List
8
+
9
+ import jmespath
10
+
11
+ from chadselect._functions import split_functions, parse_and_apply
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def process(jmespath_with_functions: str, raw_json: str) -> List[str]:
    """Evaluate a JMESPath expression against a JSON document.

    The result is flattened to a list of strings and then passed through
    any ``>>`` function chain. Invalid JSON or a failing expression logs a
    warning and yields ``[]``.
    """
    expr, func_chain = split_functions(jmespath_with_functions)

    try:
        document = json.loads(raw_json)
    except json.JSONDecodeError as e:
        logger.warning("Invalid JSON content: %s", e)
        return []

    try:
        found = jmespath.search(expr, document)
    except Exception as e:
        logger.warning("JMESPath failed for '%s': %s", expr, e)
        return []

    strings = _to_string_list(found)
    if not func_chain.strip():
        return strings
    return parse_and_apply(strings, func_chain)
41
+
42
+
43
def _to_string_list(value) -> List[str]:
    """Flatten a JMESPath result into a list of non-empty strings.

    ``None`` yields ``[]``. A list is stringified item by item, and any
    other value is treated as a single item. Items that stringify to
    ``""`` are dropped.
    """
    if value is None:
        return []
    items = value if isinstance(value, list) else [value]
    return [text for text in map(_stringify, items) if text]
56
+
57
+
58
+ def _stringify(value) -> str:
59
+ """Convert a single value to string."""
60
+ if value is None:
61
+ return ""
62
+ if isinstance(value, bool):
63
+ return str(value).lower()
64
+ if isinstance(value, (dict, list)):
65
+ return json.dumps(value, separators=(",", ":"))
66
+ return str(value)
@@ -0,0 +1,49 @@
1
+ """Regex engine — powered by the ``re`` stdlib module."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import re
7
+ from typing import List
8
+
9
+ from chadselect._functions import split_functions, parse_and_apply
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def process(pattern_with_functions: str, content: str) -> List[str]:
    """Match a regex against *content* and return the captured text.

    - A pattern with capture groups contributes every group that took
      part in each match.
    - A pattern without groups contributes each full match.

    Results are stripped, empty ones are dropped, and any ``>>`` function
    chain is applied last. An invalid pattern logs a warning and yields
    ``[]``.
    """
    pattern_str, func_chain = split_functions(pattern_with_functions)

    try:
        compiled = re.compile(pattern_str)
    except re.error as e:
        logger.warning("Invalid regex '%s': %s", pattern_str, e)
        return []

    if compiled.groups == 0:
        # No capture groups: findall returns the full match strings.
        raw = compiled.findall(content)
    else:
        raw = [
            group
            for match in compiled.finditer(content)
            for group in match.groups()
            if group is not None
        ]

    # Strip every hit and discard those that were only whitespace.
    results: List[str] = [s for s in (r.strip() for r in raw) if s]

    if func_chain.strip():
        results = parse_and_apply(results, func_chain)

    return results
@@ -0,0 +1,48 @@
1
+ """XPath 1.0 engine — powered by lxml (libxml2)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import List
7
+
8
+ from lxml import html as lxml_html
9
+
10
+ from chadselect._functions import split_functions, parse_and_apply
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def process(xpath_with_functions: str, content: str) -> List[str]:
    """Run an XPath 1.0 expression against HTML/text content.

    Args:
        xpath_with_functions: XPath expression, optionally followed by a
            ``>>`` function chain.
        content: Markup to parse with ``lxml.html``.

    Returns:
        The stripped, non-empty text of every result, after the chain has
        been applied. A parse or evaluation failure logs a warning and
        returns ``[]``.
    """
    xpath_expr, func_chain = split_functions(xpath_with_functions)

    try:
        tree = lxml_html.fromstring(content)
        raw = tree.xpath(xpath_expr)
    except Exception as e:
        logger.warning("XPath failed for '%s': %s", xpath_expr, e)
        return []

    # An expression may evaluate to a scalar rather than a list: a string
    # (normalize-space(), string()), a number (count()) or a boolean.
    # Scalars are wrapped so the loop below handles them instead of
    # raising on a non-iterable value.
    if isinstance(raw, bool):
        raw = ["true" if raw else "false"]
    elif isinstance(raw, float):
        # Whole numbers print without a trailing ".0".
        raw = [str(int(raw)) if raw.is_integer() else str(raw)]
    elif not isinstance(raw, list):
        raw = [raw]

    results: List[str] = []
    for item in raw:
        if hasattr(item, "text_content"):
            # It's an Element.
            text = item.text_content().strip()
        else:
            # It's a string (text node or attribute) or wrapped scalar.
            text = str(item).strip()
        if text:
            results.append(text)

    if func_chain.strip():
        results = parse_and_apply(results, func_chain)

    return results
chadselect/py.typed ADDED
@@ -0,0 +1,23 @@
1
+ from typing import Callable, List, Sequence, Tuple
2
+
3
class ChadSelect:
    """Type stub listing the public methods of the extraction class."""

    def __init__(self) -> None:
        ...

    def add_text(self, content: str) -> None:
        ...

    def add_html(self, content: str) -> None:
        ...

    def add_json(self, content: str) -> None:
        ...

    def content_count(self) -> int:
        ...

    def clear(self) -> None:
        ...

    def query(self, index: int, query_str: str) -> List[str]:
        ...

    def select(self, index: int, query_str: str) -> str:
        ...

    def select_where(
        self, index: int, query_str: str, valid: Callable[[str], bool]
    ) -> str:
        ...

    def select_first(self, queries: Sequence[Tuple[int, str]]) -> List[str]:
        ...

    def select_first_where(
        self, queries: Sequence[Tuple[int, str]], valid: Callable[[str], bool]
    ) -> List[str]:
        ...

    def select_many(self, queries: Sequence[Tuple[int, str]]) -> List[str]:
        ...

    def select_many_where(
        self, queries: Sequence[Tuple[int, str]], valid: Callable[[str], bool]
    ) -> List[str]:
        ...

    def query_batch(self, queries: Sequence[Tuple[int, str]]) -> List[List[str]]:
        ...

    def __repr__(self) -> str:
        ...

    def __len__(self) -> int:
        ...
20
+
21
+ FUNCTION_PIPE: str
22
+ __version__: str
23
+ __all__: List[str]
@@ -0,0 +1,113 @@
1
+ Metadata-Version: 2.4
2
+ Name: chadselect
3
+ Version: 0.2.0
4
+ Summary: Unified data extraction — CSS, XPath, Regex, and JMESPath behind one query interface.
5
+ Project-URL: Homepage, https://github.com/markjacksoncerberus/chadselect
6
+ Project-URL: Repository, https://github.com/markjacksoncerberus/chadselect
7
+ Project-URL: Issues, https://github.com/markjacksoncerberus/chadselect/issues
8
+ Author: Mark Jackson
9
+ License: MIT
10
+ Keywords: css,extraction,jmespath,parsing,regex,scraping,xpath
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Programming Language :: Python :: 3.14
21
+ Classifier: Programming Language :: Python :: 3.15
22
+ Classifier: Topic :: Text Processing :: Markup :: HTML
23
+ Classifier: Topic :: Text Processing :: Markup :: XML
24
+ Classifier: Typing :: Typed
25
+ Requires-Python: >=3.9
26
+ Requires-Dist: jmespath>=1.0
27
+ Requires-Dist: lxml>=5.0
28
+ Requires-Dist: selectolax>=0.3.21
29
+ Provides-Extra: dev
30
+ Requires-Dist: pytest-asyncio>=0.21; extra == 'dev'
31
+ Requires-Dist: pytest>=7.0; extra == 'dev'
32
+ Description-Content-Type: text/markdown
33
+
34
+ # ChadSelect
35
+
36
+ Unified data extraction — CSS, XPath, Regex, and JMESPath behind one query interface.
37
+
38
+ ```python
39
+ from chadselect import ChadSelect
40
+
41
+ cs = ChadSelect()
42
+ cs.add_html(html)
43
+ cs.add_json(json_str)
44
+
45
+ # One syntax, four engines
46
+ title = cs.select(0, "css:h1.title")
47
+ author = cs.select(0, "xpath://span[@class='author']/text()")
48
+ vin = cs.select(0, r"regex:[A-HJ-NPR-Z0-9]{17}")
49
+ name = cs.select(0, "json:data.products[0].name")
50
+
51
+ # Function piping
52
+ clean = cs.select(0, "css:.price >> trim >> uppercase()")
53
+ ```
54
+
55
+ ## Install
56
+
57
+ ```bash
58
+ pip install chadselect
59
+ ```
60
+
61
+ ## Query Syntax
62
+
63
+ Queries use a `engine:expression` prefix:
64
+
65
+ | Prefix | Engine | Best For |
66
+ |--------|--------|----------|
67
+ | `css:` | CSS Selectors (selectolax) | HTML element selection |
68
+ | `xpath:` | XPath 1.0 (lxml) | Complex HTML/XML traversal |
69
+ | `regex:` | Regular Expressions (re) | Pattern matching on raw text |
70
+ | `json:` | JMESPath (jmespath) | JSON field extraction |
71
+
72
+ No prefix defaults to regex.
73
+
74
+ ## Function Piping
75
+
76
+ Chain text transformations with `>>`:
77
+
78
+ ```python
79
+ cs.select(0, "css:.price >> trim >> substring-after('$') >> uppercase()")
80
+ ```
81
+
82
+ Available functions: `trim`, `uppercase()`, `lowercase()`, `normalize-space()`,
83
+ `substring-after('delim')`, `substring-before('delim')`, `substring(start, len)`,
84
+ `replace('old', 'new')`, `get-attr('name')`.
85
+
86
+ ## API
87
+
88
+ ```python
89
+ cs = ChadSelect()
90
+
91
+ # Load content
92
+ cs.add_html(html_string)
93
+ cs.add_json(json_string)
94
+ cs.add_text(plain_text)
95
+
96
+ # Query (index: 0=first, -1=all)
97
+ results = cs.query(-1, "css:.price") # List[str] — all matches
98
+ value = cs.select(0, "css:.price") # str — first match or ""
99
+
100
+ # Multi-query
101
+ first_hit = cs.select_first([(0, "css:#id"), (0, "xpath://fallback")])
102
+ combined = cs.select_many([(-1, "css:.a"), (-1, "css:.b")])
103
+
104
+ # Batch (fastest for many fields)
105
+ results = cs.query_batch([(-1, "css:.title"), (-1, "json:data.name")])
106
+
107
+ # With validators
108
+ vin = cs.select_where(0, "css:.vin", lambda v: len(v) == 17)  # str — "" when the check fails
109
+ ```
110
+
111
+ ## License
112
+
113
+ MIT
@@ -0,0 +1,13 @@
1
+ chadselect/__init__.py,sha256=EOImbkqxe148OUS8ZKHAlAnw0zsXyh5vCUjJcJjmJfc,989
2
+ chadselect/_chadselect.py,sha256=sZ7YJnPt7WDogzVTJPLI3FIjThd4Ge5ZjELeP7IIJys,7571
3
+ chadselect/_functions.py,sha256=JeM5a8ZAkoDEOrLilP_ub90eF8fUP-q9n4dgUX_lDcQ,4149
4
+ chadselect/_query.py,sha256=7nNtkcArP8_SAYtkKod-Jrop_9as8RhDIofdwRSROW4,2032
5
+ chadselect/py.typed,sha256=oodZR5L_XXc9i-XuKipIh9EKO__Be3NSzuJcaHrzj_o,1158
6
+ chadselect/engine/__init__.py,sha256=iJuOnTANwq4jWk6lNbjmvRrGYRXgcJYnXsVhDdgMTHY,45
7
+ chadselect/engine/css.py,sha256=Xs_5T9WfIxSpyD1eD458g6g4Te3E7bfmpnmYlEh6hFA,4995
8
+ chadselect/engine/json.py,sha256=ZSx8j2t7n-QJgvslaKwqBAo7TyxAHLcfs04NqoZSS3k,1702
9
+ chadselect/engine/regex.py,sha256=wL7FaX0-3Q7Ib7ZR-8lhfdulPnPxGvQo4A-mbjNPJtc,1363
10
+ chadselect/engine/xpath.py,sha256=QHJvs0xxprJg_htyBTvsBurnY8TXzemav03aaZAFH0w,1338
11
+ chadselect-0.2.0.dist-info/METADATA,sha256=VFdzqtfF5ch6l3erISQHLLeJ_08_Pt0cSY_w6L_a0pk,3411
12
+ chadselect-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
13
+ chadselect-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any