web2cli 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- web2cli/__init__.py +3 -0
- web2cli/__main__.py +5 -0
- web2cli/adapter/__init__.py +0 -0
- web2cli/adapter/lint.py +667 -0
- web2cli/adapter/loader.py +157 -0
- web2cli/adapter/validator.py +127 -0
- web2cli/adapters/discord.com/web2cli.yaml +476 -0
- web2cli/adapters/mail.google.com/parsers/inbox.py +200 -0
- web2cli/adapters/mail.google.com/web2cli.yaml +52 -0
- web2cli/adapters/news.ycombinator.com/web2cli.yaml +356 -0
- web2cli/adapters/reddit.com/web2cli.yaml +233 -0
- web2cli/adapters/slack.com/web2cli.yaml +445 -0
- web2cli/adapters/stackoverflow.com/web2cli.yaml +257 -0
- web2cli/adapters/x.com/providers/x_graphql.py +299 -0
- web2cli/adapters/x.com/web2cli.yaml +449 -0
- web2cli/auth/__init__.py +0 -0
- web2cli/auth/browser_login.py +820 -0
- web2cli/auth/manager.py +166 -0
- web2cli/auth/store.py +68 -0
- web2cli/cli.py +1286 -0
- web2cli/executor/__init__.py +0 -0
- web2cli/executor/http.py +113 -0
- web2cli/output/__init__.py +0 -0
- web2cli/output/formatter.py +116 -0
- web2cli/parser/__init__.py +0 -0
- web2cli/parser/custom.py +21 -0
- web2cli/parser/html_parser.py +111 -0
- web2cli/parser/transforms.py +127 -0
- web2cli/pipe.py +10 -0
- web2cli/providers/__init__.py +6 -0
- web2cli/providers/base.py +22 -0
- web2cli/providers/registry.py +86 -0
- web2cli/runtime/__init__.py +1 -0
- web2cli/runtime/cache.py +42 -0
- web2cli/runtime/engine.py +743 -0
- web2cli/runtime/parser.py +398 -0
- web2cli/runtime/template.py +52 -0
- web2cli/types.py +71 -0
- web2cli-0.2.0.dist-info/METADATA +467 -0
- web2cli-0.2.0.dist-info/RECORD +44 -0
- web2cli-0.2.0.dist-info/WHEEL +5 -0
- web2cli-0.2.0.dist-info/entry_points.txt +2 -0
- web2cli-0.2.0.dist-info/licenses/LICENSE +202 -0
- web2cli-0.2.0.dist-info/top_level.txt +1 -0
|
File without changes
|
web2cli/executor/http.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""HTTP request execution via httpx (default) or curl_cffi (TLS impersonation)."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
|
|
8
|
+
from web2cli.types import Request
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class HttpError(Exception):
    """HTTP failure carrying the response status code (0 = transport-level error)."""

    def __init__(self, status_code: int, message: str):
        super().__init__(message)
        # Kept as an attribute so callers can branch on the status.
        self.status_code = status_code
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
async def _execute_httpx(request: Request) -> tuple[int, dict, str]:
    """Execute via httpx (standard path)."""
    # Decide how the body travels: form-encoded dicts go through `data=`,
    # other dicts through `json=`, raw str/bytes through `content=`.
    ctype = (request.content_type or request.headers.get("Content-Type", "")).lower()
    body = request.body
    is_form = isinstance(body, dict) and ctype.startswith(
        "application/x-www-form-urlencoded"
    )

    raw_body = body if isinstance(body, (str, bytes)) else None
    form_body = body if is_form else None
    json_body = body if isinstance(body, dict) and not is_form else None

    try:
        async with httpx.AsyncClient(follow_redirects=True) as client:
            response = await client.request(
                method=request.method,
                url=request.url,
                params=request.params or None,
                headers=request.headers,
                cookies=request.cookies,
                content=raw_body,
                data=form_body,
                json=json_body,
            )
    except httpx.ConnectError:
        raise HttpError(0, f"Connection failed: could not reach {request.url}")
    except httpx.TimeoutException:
        raise HttpError(0, f"Request timed out: {request.url}")

    return response.status_code, dict(response.headers), response.text
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
async def _execute_impersonate(
    request: Request, impersonate: str
) -> tuple[int, dict, str]:
    """Execute via curl_cffi with TLS impersonation."""
    from curl_cffi.requests import AsyncSession

    ctype = (request.content_type or request.headers.get("Content-Type", "")).lower()
    body = request.body
    is_form = isinstance(body, dict) and ctype.startswith(
        "application/x-www-form-urlencoded"
    )

    # curl_cffi has no separate `content=` kwarg: form dicts and raw
    # str/bytes both travel via `data=`, JSON dicts via `json=`.
    data_body = body if (is_form or isinstance(body, (str, bytes))) else None
    json_body = body if isinstance(body, dict) and not is_form else None

    try:
        async with AsyncSession(impersonate=impersonate) as session:
            response = await session.request(
                method=request.method,
                url=request.url,
                params=request.params or None,
                headers=request.headers,
                cookies=request.cookies,
                data=data_body,
                json=json_body,
                allow_redirects=True,
            )
    except ConnectionError:
        raise HttpError(0, f"Connection failed: could not reach {request.url}")
    except TimeoutError:
        raise HttpError(0, f"Request timed out: {request.url}")

    return response.status_code, dict(response.headers), response.text
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
async def execute(
    request: Request, verbose: bool = False, impersonate: str | None = None
) -> tuple[int, dict, str]:
    """Execute HTTP request. Returns (status_code, headers, body).

    Args:
        request: The request to send.
        verbose: Log request/response details to stderr.
        impersonate: curl_cffi browser profile name; None uses plain httpx.

    Raises:
        HttpError: on transport failure (status 0), 429, 403, or any 5xx.
    """
    if verbose:
        sys.stderr.write(f"→ {request.method} {request.url}\n")
        if request.params:
            sys.stderr.write(f" params: {request.params}\n")
        if impersonate:
            sys.stderr.write(f" impersonate: {impersonate}\n")

    start = time.monotonic()

    if impersonate:
        status, headers, body = await _execute_impersonate(request, impersonate)
    else:
        status, headers, body = await _execute_httpx(request)

    elapsed = time.monotonic() - start

    if verbose:
        sys.stderr.write(f"← {status} ({elapsed:.2f}s)\n")

    if status == 429:
        # Fix: httpx lower-cases header names in dict(response.headers) while
        # curl_cffi preserves the wire casing, so an exact "Retry-After" lookup
        # silently missed on the httpx path. Look the header up case-insensitively.
        retry = next(
            (v for k, v in headers.items() if k.lower() == "retry-after"), "?"
        )
        raise HttpError(429, f"Rate limited. Try again in {retry} seconds.")
    if status == 403:
        raise HttpError(
            403,
            "Access denied. You may need to login: web2cli login <domain>",
        )
    if status >= 500:
        raise HttpError(status, f"Server error ({status})")

    return status, headers, body
|
|
File without changes
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Output formatting: table, json, csv, plain, md."""
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import io
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
from rich import box
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
from rich.table import Table
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def format_output(
    records: list[dict],
    fmt: str = "table",
    fields: list[str] | None = None,
    no_color: bool = False,
    no_header: bool = False,
) -> str:
    """Format records for stdout.

    Args:
        records: List of dicts to format.
        fmt: Output format — table (default), json, csv, plain, md.
        fields: Which fields to include (None = all keys of the first record).
        no_color: Disable colored output (table only).
        no_header: Omit header row (csv only).
    """
    if not records:
        return ""

    # Default the column list to the first record's keys.
    selected = fields if fields else list(records[0].keys())

    # Project every record down to just the selected fields.
    rows = [{key: rec.get(key) for key in selected} for rec in records]

    if fmt == "json":
        return _format_json(rows)
    if fmt == "csv":
        return _format_csv(rows, selected, no_header=no_header)
    if fmt == "plain":
        return _format_plain(rows, selected)
    if fmt == "md":
        return _format_markdown(rows, selected)
    return _format_table(rows, selected, no_color)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _format_table(records: list[dict], fields: list[str], no_color: bool) -> str:
    """Render records as a Rich table and return the captured text."""
    table = Table(
        show_header=True,
        header_style=None if no_color else "bold",
        show_lines=False,
        pad_edge=False,
        # ASCII box when colors are off so output stays pipe-friendly.
        box=box.ASCII2 if no_color else box.HEAVY_HEAD,
    )

    for name in fields:
        table.add_column(name.upper())

    for rec in records:
        cells = []
        for name in fields:
            raw = rec.get(name)
            cells.append("" if raw is None else str(raw))
        table.add_row(*cells)

    console = Console(no_color=no_color, force_terminal=not no_color)
    with console.capture() as cap:
        console.print(table)
    return cap.get().rstrip()
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _format_json(records: list[dict]) -> str:
|
|
78
|
+
"""JSON array output."""
|
|
79
|
+
return json.dumps(records, indent=2, ensure_ascii=False)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _format_csv(records: list[dict], fields: list[str], no_header: bool = False) -> str:
|
|
83
|
+
"""CSV output."""
|
|
84
|
+
buf = io.StringIO()
|
|
85
|
+
writer = csv.DictWriter(buf, fieldnames=fields, extrasaction="ignore")
|
|
86
|
+
if not no_header:
|
|
87
|
+
writer.writeheader()
|
|
88
|
+
writer.writerows(records)
|
|
89
|
+
return buf.getvalue().rstrip()
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _format_markdown(records: list[dict], fields: list[str]) -> str:
|
|
93
|
+
"""Markdown table output."""
|
|
94
|
+
headers = [f.upper() for f in fields]
|
|
95
|
+
lines = ["| " + " | ".join(headers) + " |"]
|
|
96
|
+
lines.append("| " + " | ".join("---" for _ in fields) + " |")
|
|
97
|
+
for record in records:
|
|
98
|
+
cells = []
|
|
99
|
+
for field in fields:
|
|
100
|
+
val = record.get(field)
|
|
101
|
+
cell = str(val) if val is not None else ""
|
|
102
|
+
cell = cell.replace("|", "\\|")
|
|
103
|
+
cells.append(cell)
|
|
104
|
+
lines.append("| " + " | ".join(cells) + " |")
|
|
105
|
+
return "\n".join(lines)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _format_plain(records: list[dict], fields: list[str]) -> str:
|
|
109
|
+
"""Plain output — first field only, one per line. Best for piping."""
|
|
110
|
+
first_field = fields[0]
|
|
111
|
+
lines = []
|
|
112
|
+
for record in records:
|
|
113
|
+
val = record.get(first_field)
|
|
114
|
+
if val is not None:
|
|
115
|
+
lines.append(str(val))
|
|
116
|
+
return "\n".join(lines)
|
|
File without changes
|
web2cli/parser/custom.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Dynamic import and execution of custom parser scripts."""
|
|
2
|
+
|
|
3
|
+
import importlib.util
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def parse_custom(
    script_path: str,
    adapter_dir: Path,
    status_code: int,
    headers: dict,
    body: str,
    args: dict,
) -> list[dict]:
    """Import and call a custom parser script.

    The script at ``adapter_dir / script_path`` must define a top-level
    ``parse(status_code, headers, body, args)`` function returning a list
    of record dicts.

    Raises:
        ImportError: if the script cannot be located/loaded, or does not
            define a ``parse`` function. (Previously these cases surfaced
            as opaque AttributeErrors on a None spec/module attribute.)
    """
    full_path = adapter_dir / script_path
    spec = importlib.util.spec_from_file_location("custom_parser", full_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load custom parser: {full_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    parse = getattr(module, "parse", None)
    if parse is None:
        raise ImportError(f"Custom parser has no parse() function: {full_path}")
    return parse(status_code, headers, body, args)
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""HTML response parser using selectolax."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
from selectolax.parser import HTMLParser
|
|
6
|
+
|
|
7
|
+
from web2cli.parser.transforms import apply_transform
|
|
8
|
+
|
|
9
|
+
# Page titles that indicate bot/CAPTCHA blocking.
# Matched as case-insensitive substrings (callers lowercase the <title> text first).
_BLOCK_SIGNALS = ("human verification", "captcha", "access denied", "just a moment")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def parse_html(body: str, response_spec: dict, disable_truncate: bool = False) -> list[dict]:
    """Parse HTML response using CSS selectors from spec.

    Args:
        body: Raw HTML text.
        response_spec: Adapter spec with an ``extract`` container selector
            and a list of per-item ``fields`` descriptors.
        disable_truncate: Skip display-oriented truncation when True.

    Returns:
        One dict per matched container element; empty list when nothing
        matches (bot-block pages are additionally reported on stderr).
    """
    tree = HTMLParser(body)

    # Extract items matching the top-level selector.
    items = tree.css(response_spec.get("extract", "body"))
    if not items:
        _warn_if_blocked(tree)
        return []

    fields = response_spec.get("fields", [])
    if not fields:
        # No field specs: fall back to each matched element's text.
        return [{"text": node.text(strip=True)} for node in items]

    return [
        {spec["name"]: _extract_field(item, spec, disable_truncate) for spec in fields}
        for item in items
    ]


def _warn_if_blocked(tree) -> None:
    """Print a stderr hint when the page title looks like a CAPTCHA/bot wall."""
    title_el = tree.css_first("title")
    if title_el is None:
        return
    title = title_el.text(strip=True).lower()
    if any(s in title for s in _BLOCK_SIGNALS):
        print(
            f"Blocked by site ({title_el.text(strip=True)}). "
            "Try again later or use `web2cli login` to add cookies.",
            file=sys.stderr,
        )


def _extract_field(item, field_spec: dict, disable_truncate: bool):
    """Extract one field's value from one container element."""
    path = field_spec.get("path", "")
    attribute = field_spec.get("attribute", "text")

    # Resolve which element the selector is applied against.
    target = item
    relative = field_spec.get("relative", "self")
    if relative == "next":
        target = _next_element(item.next)
    elif relative == "parent":
        target = item.parent

    if field_spec.get("collect", False):
        # Collect every matching sub-element; join the non-empty values.
        sub_nodes = target.css(path) if target else []
        values = [v for v in (_extract_attr(n, attribute) for n in sub_nodes) if v]
        value = field_spec.get("join", ", ").join(values)
    else:
        # Single element; empty path means "use the target itself".
        node = target.css_first(path) if (target and path) else target
        value = _extract_attr(node, attribute) if node else None

    prefix = field_spec.get("prefix", "")
    if prefix and value:
        value = prefix + value

    transform = field_spec.get("transform")
    if transform:
        value = apply_transform(value, transform, disable_truncate=disable_truncate)

    # Truncation is a display hint, applied after transforms.
    truncate = field_spec.get("truncate")
    if (
        not disable_truncate
        and truncate
        and value
        and isinstance(value, str)
        and len(value) > truncate
    ):
        value = value[:truncate] + "..."

    if value is None:
        value = field_spec.get("default")
    return value
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _extract_attr(node, attribute: str) -> str | None:
|
|
98
|
+
"""Extract an attribute value from a selectolax node."""
|
|
99
|
+
if node is None:
|
|
100
|
+
return None
|
|
101
|
+
if attribute == "text":
|
|
102
|
+
return node.text(strip=True) or None
|
|
103
|
+
return node.attributes.get(attribute)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _next_element(node):
|
|
107
|
+
"""Return next element-like node, skipping text/whitespace nodes."""
|
|
108
|
+
cur = node
|
|
109
|
+
while cur is not None and not hasattr(cur, "css_first"):
|
|
110
|
+
cur = cur.next
|
|
111
|
+
return cur
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""Shared value transforms used by JSON and HTML parsers."""
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
import re
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def apply_transform(value, transform: str, disable_truncate: bool = False):
    """Apply a named transform to a value.

    Supported transforms: ``round``, ``int`` (handles "1.2k"/"3m"/"1,234"
    shorthand and numbers embedded in text), ``lowercase``, ``uppercase``,
    ``strip_html``, ``timestamp``, ``x_datetime``, ``x_date`` and
    ``truncate:N``. None passes through untouched; unknown transforms
    return the value unchanged.
    """
    if value is None:
        return value

    if transform == "round":
        try:
            return round(float(value))
        except (ValueError, TypeError):
            return value

    if transform == "int":
        text = str(value).strip().lower().replace(",", "")
        # "1.2k" / "3m" shorthand counts.
        if text.endswith("k"):
            try:
                return int(float(text[:-1]) * 1000)
            except (ValueError, TypeError):
                pass
        if text.endswith("m"):
            try:
                return int(float(text[:-1]) * 1000000)
            except (ValueError, TypeError):
                pass
        try:
            return int(float(text))
        except (ValueError, TypeError):
            # Fall back to the first number embedded in the text.
            m = re.search(r"-?\d[\d,]*", text)
            if m:
                try:
                    return int(m.group(0).replace(",", ""))
                except ValueError:
                    return value
            return value

    if transform == "lowercase":
        return str(value).lower()

    if transform == "uppercase":
        return str(value).upper()

    if transform == "strip_html":
        # Drop tags, then decode the common HTML entities.
        # Fix: the entity substitutions had degraded into identity no-ops
        # (pattern and replacement were the same literal character);
        # restore real &lt;/&gt;/&quot;/&#x27;/&nbsp;/&amp; decoding.
        text = re.sub(r"<[^>]+>", " ", str(value))
        text = text.replace("&lt;", "<")
        text = text.replace("&gt;", ">")
        text = text.replace("&quot;", '"')
        text = text.replace("&#x27;", "'").replace("&#39;", "'")
        text = text.replace("&nbsp;", " ")
        # &amp; last so "&amp;lt;" does not double-decode.
        text = text.replace("&amp;", "&")
        return re.sub(r"\s+", " ", text).strip()

    if transform == "timestamp":
        return _parse_timestamp(value)

    if transform == "x_datetime":
        return _parse_twitter_datetime(value)

    if transform == "x_date":
        full = _parse_twitter_datetime(value)
        # Keep only the YYYY-MM-DD part when a full datetime came back.
        if isinstance(full, str) and len(full) >= 10:
            return full[:10]
        return full

    if transform.startswith("truncate:"):
        if disable_truncate:
            return value
        try:
            n = int(transform.split(":")[1])
            s = str(value)
            return s[:n] + "..." if len(s) > n else s
        except (ValueError, IndexError):
            return value

    return value
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _parse_timestamp(value) -> str:
|
|
84
|
+
"""Convert various timestamp formats to readable string."""
|
|
85
|
+
# Unix timestamp (int or float)
|
|
86
|
+
if isinstance(value, (int, float)):
|
|
87
|
+
if value > 1e12:
|
|
88
|
+
value = value / 1000 # milliseconds
|
|
89
|
+
try:
|
|
90
|
+
dt = datetime.fromtimestamp(value, tz=timezone.utc)
|
|
91
|
+
return dt.strftime("%Y-%m-%d %H:%M")
|
|
92
|
+
except (OSError, ValueError):
|
|
93
|
+
return str(value)
|
|
94
|
+
|
|
95
|
+
# ISO string
|
|
96
|
+
if isinstance(value, str):
|
|
97
|
+
# numeric string (unix seconds / milliseconds)
|
|
98
|
+
if re.fullmatch(r"\d+(\.\d+)?", value.strip()):
|
|
99
|
+
try:
|
|
100
|
+
num = float(value.strip())
|
|
101
|
+
if num > 1e12:
|
|
102
|
+
num = num / 1000
|
|
103
|
+
dt = datetime.fromtimestamp(num, tz=timezone.utc)
|
|
104
|
+
return dt.strftime("%Y-%m-%d %H:%M")
|
|
105
|
+
except (ValueError, OSError):
|
|
106
|
+
pass
|
|
107
|
+
|
|
108
|
+
for fmt in ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S"):
|
|
109
|
+
try:
|
|
110
|
+
dt = datetime.strptime(value, fmt).replace(tzinfo=timezone.utc)
|
|
111
|
+
return dt.strftime("%Y-%m-%d %H:%M")
|
|
112
|
+
except ValueError:
|
|
113
|
+
continue
|
|
114
|
+
return value
|
|
115
|
+
|
|
116
|
+
return str(value)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _parse_twitter_datetime(value) -> str:
|
|
120
|
+
"""Convert X/Twitter datetime to readable format."""
|
|
121
|
+
if not isinstance(value, str):
|
|
122
|
+
return str(value)
|
|
123
|
+
try:
|
|
124
|
+
dt = datetime.strptime(value, "%a %b %d %H:%M:%S %z %Y")
|
|
125
|
+
return dt.strftime("%Y-%m-%d %H:%M")
|
|
126
|
+
except ValueError:
|
|
127
|
+
return value
|
web2cli/providers/base.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Provider base class for request generation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from web2cli.types import AdapterSpec, Request, Session
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Provider:
    """Provider plugin contract.

    Subclasses set a non-empty ``name`` (the registry key) and implement
    :meth:`build_request` to turn a command spec plus runtime context into
    a concrete HTTP request.
    """

    # Registry key; subclasses must override with a non-empty string.
    name: str = ""

    def build_request(
        self,
        spec: dict[str, Any],
        ctx: dict[str, Any],
        adapter: AdapterSpec,
        session: Session | None,
    ) -> Request:
        """Build the HTTP request for one command invocation (abstract)."""
        raise NotImplementedError
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Provider registry."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import importlib.util
|
|
6
|
+
import re
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from web2cli.providers.base import Provider
|
|
10
|
+
from web2cli.types import AdapterSpec
|
|
11
|
+
|
|
12
|
+
# Registered provider instances, keyed by provider name.
_PROVIDERS: dict[str, Provider] = {}
# Guard so built-in registration runs at most once per process.
_BUILTINS_REGISTERED = False
# Keys of provider modules already exec'd from disk (prevents double-execution).
_DYNAMIC_MODULES_LOADED: set[str] = set()

# Adapter search roots: adapters bundled with the package, then the
# user's local adapters under ~/.web2cli/adapters.
_BUILTIN_ADAPTERS_DIR = Path(__file__).resolve().parent.parent / "adapters"
_USER_ADAPTERS_DIR = Path.home() / ".web2cli" / "adapters"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def register_provider(provider: Provider) -> None:
    """Add a provider instance to the global registry, keyed by its name.

    Raises:
        ValueError: if the provider's ``name`` is empty.
    """
    name = provider.name
    if not name:
        raise ValueError("Provider must have a name")
    _PROVIDERS[name] = provider
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _register_builtins_once() -> None:
    """Flip the one-shot built-ins flag (no built-in providers are registered here)."""
    global _BUILTINS_REGISTERED
    if not _BUILTINS_REGISTERED:
        _BUILTINS_REGISTERED = True
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _safe_ident(value: str) -> str:
|
|
35
|
+
return re.sub(r"[^A-Za-z0-9_]+", "_", value)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _load_provider_module(module_path: Path, key: str) -> None:
    """Execute a provider module from disk, at most once per unique key.

    NOTE(review): the module presumably registers itself via
    register_provider() at import time — confirm against shipped adapters.
    """
    if key in _DYNAMIC_MODULES_LOADED:
        return
    if not module_path.is_file():
        return

    module_name = f"web2cli_dynamic_provider_{_safe_ident(key)}"
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    if spec is None or spec.loader is None:
        return

    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    # Record only after successful execution.
    _DYNAMIC_MODULES_LOADED.add(key)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _load_from_adapter(adapter: AdapterSpec, provider_name: str) -> None:
    """Try to load `providers/<name>.py` from one adapter's own directory."""
    adapter_dir = adapter.adapter_dir
    if adapter_dir is None:
        return
    module_path = adapter_dir / "providers" / f"{provider_name}.py"
    cache_key = f"{adapter.meta.domain}:{provider_name}:{module_path}"
    _load_provider_module(module_path, cache_key)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _load_from_known_adapter_dirs(provider_name: str) -> None:
    """Scan built-in and user adapter directories for a matching provider module."""
    for root in (_BUILTIN_ADAPTERS_DIR, _USER_ADAPTERS_DIR):
        if not root.is_dir():
            continue
        for adapter_dir in root.iterdir():
            module_path = adapter_dir / "providers" / f"{provider_name}.py"
            cache_key = f"{adapter_dir}:{provider_name}:{module_path}"
            _load_provider_module(module_path, cache_key)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def get_provider(name: str, adapter: AdapterSpec | None = None) -> Provider:
    """Resolve a provider by name, lazily loading adapter-local modules.

    Lookup order: already-registered providers, then the given adapter's
    providers/ directory, then every known adapter directory.

    Raises:
        ValueError: if no provider with that name can be found.
    """
    _register_builtins_once()

    provider = _PROVIDERS.get(name)
    if provider is not None:
        return provider

    if adapter is not None:
        _load_from_adapter(adapter, name)
        provider = _PROVIDERS.get(name)
        if provider is not None:
            return provider

    _load_from_known_adapter_dirs(name)
    provider = _PROVIDERS.get(name)
    if provider is None:
        raise ValueError(f"Unknown provider: {name}")
    return provider
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""web2cli runtime modules."""
|
web2cli/runtime/cache.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Small file cache for runtime resources."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
import time
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
BASE_CACHE_DIR = Path.home() / ".web2cli" / "cache"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _cache_path(domain: str, key: str) -> Path:
|
|
15
|
+
digest = hashlib.sha1(key.encode()).hexdigest() # nosec: non-crypto use
|
|
16
|
+
return BASE_CACHE_DIR / domain / "runtime" / f"{digest}.json"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def load_cache(domain: str, key: str, ttl: int | None = None) -> Any | None:
    """Load cached payload if present and not expired.

    Args:
        domain: Site domain the entry belongs to.
        key: Opaque cache key (hashed for the filename).
        ttl: Max age in seconds; falsy means entries never expire.

    Returns:
        The cached payload, or None when missing, unreadable, or stale.
    """
    path = _cache_path(domain, key)
    if not path.is_file():
        return None

    try:
        data = json.loads(path.read_text())
    except (json.JSONDecodeError, OSError):
        # Corrupt or unreadable entries are treated as cache misses.
        return None

    ts = data.get("ts")
    expired = (
        bool(ttl)
        and isinstance(ts, (int, float))
        and time.time() - ts > ttl
    )
    return None if expired else data.get("payload")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def save_cache(domain: str, key: str, payload: Any) -> None:
    """Persist payload in cache, creating parent directories as needed."""
    path = _cache_path(domain, key)
    path.parent.mkdir(parents=True, exist_ok=True)
    # Store alongside a write timestamp so load_cache can enforce TTLs.
    path.write_text(json.dumps({"ts": time.time(), "payload": payload}))
|