som-parser 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- som_parser-0.3.0/.gitignore +22 -0
- som_parser-0.3.0/PKG-INFO +144 -0
- som_parser-0.3.0/README.md +129 -0
- som_parser-0.3.0/pyproject.toml +24 -0
- som_parser-0.3.0/som_parser/__init__.py +72 -0
- som_parser-0.3.0/som_parser/parser.py +79 -0
- som_parser-0.3.0/som_parser/query.py +221 -0
- som_parser-0.3.0/som_parser/types.py +152 -0
- som_parser-0.3.0/tests/__init__.py +0 -0
- som_parser-0.3.0/tests/test_parser.py +421 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/target
|
|
2
|
+
*.swp
|
|
3
|
+
*.swo
|
|
4
|
+
*~
|
|
5
|
+
.DS_Store
|
|
6
|
+
report.md
|
|
7
|
+
.claude/settings.local.json
|
|
8
|
+
smoke/node_modules/
|
|
9
|
+
sdk/node/node_modules/
|
|
10
|
+
sdk/node/dist/
|
|
11
|
+
sdk/python/*.egg-info/
|
|
12
|
+
sdk/python/dist/
|
|
13
|
+
sdk/python/build/
|
|
14
|
+
__pycache__/
|
|
15
|
+
*.pyc
|
|
16
|
+
|
|
17
|
+
# MCP registry publisher tokens
|
|
18
|
+
.mcpregistry_*
|
|
19
|
+
.vercel
|
|
20
|
+
packages/*/node_modules/
|
|
21
|
+
packages/*/.venv/
|
|
22
|
+
packages/*/dist/
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: som-parser
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Parse and query SOM (Semantic Object Model) - the structured web format for AI agents
|
|
5
|
+
Project-URL: Homepage, https://plasmate.app
|
|
6
|
+
Project-URL: Documentation, https://plasmate.app/docs/som-spec
|
|
7
|
+
Project-URL: Repository, https://github.com/plasmate-labs/plasmate
|
|
8
|
+
License-Expression: Apache-2.0
|
|
9
|
+
Keywords: ai-agents,browser-automation,plasmate,semantic-object-model,som,web-scraping
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Requires-Dist: pydantic>=2.0
|
|
12
|
+
Provides-Extra: dev
|
|
13
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# som-parser
|
|
17
|
+
|
|
18
|
+
Parse and query [SOM (Semantic Object Model)](https://plasmate.app/docs/som-spec) output in Python. SOM is a structured JSON format that represents web pages as semantic regions and elements, designed for AI agents, browser automation, and web scraping. This library provides Pydantic v2 models for type-safe parsing, validation, and a rich set of query utilities to extract exactly what you need.
|
|
19
|
+
|
|
20
|
+
## Install
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pip install som-parser
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
### Parse Plasmate output
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import subprocess
|
|
32
|
+
from som_parser import parse_som, from_plasmate
|
|
33
|
+
|
|
34
|
+
# Parse a SOM JSON string or dict
|
|
35
|
+
som = parse_som('{"som_version": "0.1", ...}')
|
|
36
|
+
|
|
37
|
+
# Or parse raw Plasmate CLI output directly
|
|
38
|
+
result = subprocess.run(["plasmate", "https://example.com"], capture_output=True, text=True)
|
|
39
|
+
som = from_plasmate(result.stdout)
|
|
40
|
+
|
|
41
|
+
print(som.title) # "Example Domain"
|
|
42
|
+
print(som.url) # "https://example.com/"
|
|
43
|
+
print(som.som_version) # "0.1"
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Find links
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from som_parser import parse_som, get_links, find_by_role
|
|
50
|
+
|
|
51
|
+
som = parse_som(data)
|
|
52
|
+
|
|
53
|
+
# Get all links as simple dicts
|
|
54
|
+
for link in get_links(som):
|
|
55
|
+
print(f"{link['text']} -> {link['href']}")
|
|
56
|
+
|
|
57
|
+
# Or find by role for full SomElement objects
|
|
58
|
+
for el in find_by_role(som, "link"):
|
|
59
|
+
print(el.id, el.text, el.attrs.href)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### Get interactive elements
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from som_parser import parse_som, get_interactive_elements
|
|
66
|
+
|
|
67
|
+
som = parse_som(data)
|
|
68
|
+
for el in get_interactive_elements(som):
|
|
69
|
+
print(f"{el.id}: {el.role.value} - actions: {[a.value for a in el.actions]}")
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Convert to markdown
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from som_parser import parse_som, to_markdown
|
|
76
|
+
|
|
77
|
+
som = parse_som(data)
|
|
78
|
+
print(to_markdown(som))
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Use Pydantic models directly
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from som_parser import Som, SomElement, ElementRole
|
|
85
|
+
|
|
86
|
+
# Validate and construct from a dict
|
|
87
|
+
som = Som.model_validate(my_dict)
|
|
88
|
+
|
|
89
|
+
# Access typed fields
|
|
90
|
+
for region in som.regions:
|
|
91
|
+
for element in region.elements:
|
|
92
|
+
if element.role == ElementRole.LINK:
|
|
93
|
+
print(element.attrs.href)
|
|
94
|
+
|
|
95
|
+
# Serialize back to JSON
|
|
96
|
+
print(som.model_dump_json(indent=2))
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## API Reference
|
|
100
|
+
|
|
101
|
+
### Parser
|
|
102
|
+
|
|
103
|
+
| Function | Description |
|
|
104
|
+
|----------|-------------|
|
|
105
|
+
| `parse_som(input: str \| dict) -> Som` | Parse JSON string or dict into a validated Som object |
|
|
106
|
+
| `is_valid_som(input) -> bool` | Check if input conforms to the SOM schema |
|
|
107
|
+
| `from_plasmate(json_output: str) -> Som` | Parse raw Plasmate CLI JSON output |
|
|
108
|
+
|
|
109
|
+
### Query Utilities
|
|
110
|
+
|
|
111
|
+
| Function | Description |
|
|
112
|
+
|----------|-------------|
|
|
113
|
+
| `get_all_elements(som) -> list[SomElement]` | Flatten all elements from all regions |
|
|
114
|
+
| `find_by_role(som, role) -> list[SomElement]` | Find elements by role (enum or string) |
|
|
115
|
+
| `find_by_id(som, id) -> SomElement \| None` | Find a single element by its SOM id |
|
|
116
|
+
| `find_by_text(som, text, exact=False) -> list[SomElement]` | Search elements by text content |
|
|
117
|
+
| `get_interactive_elements(som) -> list[SomElement]` | Get elements that have actions |
|
|
118
|
+
| `get_links(som) -> list[dict]` | Extract all links as `{text, href, id}` dicts |
|
|
119
|
+
| `get_forms(som) -> list[SomRegion]` | Get all form regions |
|
|
120
|
+
| `get_inputs(som) -> list[SomElement]` | Get all input elements |
|
|
121
|
+
| `get_headings(som) -> list[dict]` | Extract heading hierarchy as `{level, text, id}` |
|
|
122
|
+
| `get_text(som) -> str` | Extract all visible text content |
|
|
123
|
+
| `get_text_by_region(som) -> list[dict]` | Extract text grouped by region |
|
|
124
|
+
| `get_compression_ratio(som) -> float` | Return `html_bytes / som_bytes` |
|
|
125
|
+
| `to_markdown(som) -> str` | Convert SOM to readable markdown |
|
|
126
|
+
| `filter_elements(som, predicate) -> list[SomElement]` | Generic filter with a callable |
|
|
127
|
+
|
|
128
|
+
### Types
|
|
129
|
+
|
|
130
|
+
All Pydantic v2 models are exported from the top level:
|
|
131
|
+
|
|
132
|
+
- `Som`, `SomRegion`, `SomElement`, `SomElementAttrs`, `SomMeta`
|
|
133
|
+
- `StructuredData`, `LinkElement`, `SelectOption`, `ListItem`
|
|
134
|
+
- `RegionRole`, `ElementRole`, `ElementAction`, `SemanticHint` (enums)
|
|
135
|
+
|
|
136
|
+
## Links
|
|
137
|
+
|
|
138
|
+
- [SOM Spec](https://plasmate.app/docs/som-spec)
|
|
139
|
+
- [Plasmate](https://plasmate.app)
|
|
140
|
+
- [GitHub Repository](https://github.com/plasmate-labs/plasmate)
|
|
141
|
+
|
|
142
|
+
## License
|
|
143
|
+
|
|
144
|
+
Apache-2.0
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# som-parser
|
|
2
|
+
|
|
3
|
+
Parse and query [SOM (Semantic Object Model)](https://plasmate.app/docs/som-spec) output in Python. SOM is a structured JSON format that represents web pages as semantic regions and elements, designed for AI agents, browser automation, and web scraping. This library provides Pydantic v2 models for type-safe parsing, validation, and a rich set of query utilities to extract exactly what you need.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install som-parser
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
### Parse Plasmate output
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
import subprocess
|
|
17
|
+
from som_parser import parse_som, from_plasmate
|
|
18
|
+
|
|
19
|
+
# Parse a SOM JSON string or dict
|
|
20
|
+
som = parse_som('{"som_version": "0.1", ...}')
|
|
21
|
+
|
|
22
|
+
# Or parse raw Plasmate CLI output directly
|
|
23
|
+
result = subprocess.run(["plasmate", "https://example.com"], capture_output=True, text=True)
|
|
24
|
+
som = from_plasmate(result.stdout)
|
|
25
|
+
|
|
26
|
+
print(som.title) # "Example Domain"
|
|
27
|
+
print(som.url) # "https://example.com/"
|
|
28
|
+
print(som.som_version) # "0.1"
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### Find links
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
from som_parser import parse_som, get_links, find_by_role
|
|
35
|
+
|
|
36
|
+
som = parse_som(data)
|
|
37
|
+
|
|
38
|
+
# Get all links as simple dicts
|
|
39
|
+
for link in get_links(som):
|
|
40
|
+
print(f"{link['text']} -> {link['href']}")
|
|
41
|
+
|
|
42
|
+
# Or find by role for full SomElement objects
|
|
43
|
+
for el in find_by_role(som, "link"):
|
|
44
|
+
print(el.id, el.text, el.attrs.href)
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### Get interactive elements
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from som_parser import parse_som, get_interactive_elements
|
|
51
|
+
|
|
52
|
+
som = parse_som(data)
|
|
53
|
+
for el in get_interactive_elements(som):
|
|
54
|
+
print(f"{el.id}: {el.role.value} - actions: {[a.value for a in el.actions]}")
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Convert to markdown
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from som_parser import parse_som, to_markdown
|
|
61
|
+
|
|
62
|
+
som = parse_som(data)
|
|
63
|
+
print(to_markdown(som))
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Use Pydantic models directly
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from som_parser import Som, SomElement, ElementRole
|
|
70
|
+
|
|
71
|
+
# Validate and construct from a dict
|
|
72
|
+
som = Som.model_validate(my_dict)
|
|
73
|
+
|
|
74
|
+
# Access typed fields
|
|
75
|
+
for region in som.regions:
|
|
76
|
+
for element in region.elements:
|
|
77
|
+
if element.role == ElementRole.LINK:
|
|
78
|
+
print(element.attrs.href)
|
|
79
|
+
|
|
80
|
+
# Serialize back to JSON
|
|
81
|
+
print(som.model_dump_json(indent=2))
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## API Reference
|
|
85
|
+
|
|
86
|
+
### Parser
|
|
87
|
+
|
|
88
|
+
| Function | Description |
|
|
89
|
+
|----------|-------------|
|
|
90
|
+
| `parse_som(input: str \| dict) -> Som` | Parse JSON string or dict into a validated Som object |
|
|
91
|
+
| `is_valid_som(input) -> bool` | Check if input conforms to the SOM schema |
|
|
92
|
+
| `from_plasmate(json_output: str) -> Som` | Parse raw Plasmate CLI JSON output |
|
|
93
|
+
|
|
94
|
+
### Query Utilities
|
|
95
|
+
|
|
96
|
+
| Function | Description |
|
|
97
|
+
|----------|-------------|
|
|
98
|
+
| `get_all_elements(som) -> list[SomElement]` | Flatten all elements from all regions |
|
|
99
|
+
| `find_by_role(som, role) -> list[SomElement]` | Find elements by role (enum or string) |
|
|
100
|
+
| `find_by_id(som, id) -> SomElement \| None` | Find a single element by its SOM id |
|
|
101
|
+
| `find_by_text(som, text, exact=False) -> list[SomElement]` | Search elements by text content |
|
|
102
|
+
| `get_interactive_elements(som) -> list[SomElement]` | Get elements that have actions |
|
|
103
|
+
| `get_links(som) -> list[dict]` | Extract all links as `{text, href, id}` dicts |
|
|
104
|
+
| `get_forms(som) -> list[SomRegion]` | Get all form regions |
|
|
105
|
+
| `get_inputs(som) -> list[SomElement]` | Get all input elements |
|
|
106
|
+
| `get_headings(som) -> list[dict]` | Extract heading hierarchy as `{level, text, id}` |
|
|
107
|
+
| `get_text(som) -> str` | Extract all visible text content |
|
|
108
|
+
| `get_text_by_region(som) -> list[dict]` | Extract text grouped by region |
|
|
109
|
+
| `get_compression_ratio(som) -> float` | Return `html_bytes / som_bytes` |
|
|
110
|
+
| `to_markdown(som) -> str` | Convert SOM to readable markdown |
|
|
111
|
+
| `filter_elements(som, predicate) -> list[SomElement]` | Generic filter with a callable |
|
|
112
|
+
|
|
113
|
+
### Types
|
|
114
|
+
|
|
115
|
+
All Pydantic v2 models are exported from the top level:
|
|
116
|
+
|
|
117
|
+
- `Som`, `SomRegion`, `SomElement`, `SomElementAttrs`, `SomMeta`
|
|
118
|
+
- `StructuredData`, `LinkElement`, `SelectOption`, `ListItem`
|
|
119
|
+
- `RegionRole`, `ElementRole`, `ElementAction`, `SemanticHint` (enums)
|
|
120
|
+
|
|
121
|
+
## Links
|
|
122
|
+
|
|
123
|
+
- [SOM Spec](https://plasmate.app/docs/som-spec)
|
|
124
|
+
- [Plasmate](https://plasmate.app)
|
|
125
|
+
- [GitHub Repository](https://github.com/plasmate-labs/plasmate)
|
|
126
|
+
|
|
127
|
+
## License
|
|
128
|
+
|
|
129
|
+
Apache-2.0
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "som-parser"
|
|
7
|
+
version = "0.3.0"
|
|
8
|
+
description = "Parse and query SOM (Semantic Object Model) - the structured web format for AI agents"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "Apache-2.0"
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
dependencies = ["pydantic>=2.0"]
|
|
13
|
+
keywords = ["som", "semantic-object-model", "web-scraping", "ai-agents", "browser-automation", "plasmate"]
|
|
14
|
+
|
|
15
|
+
[project.optional-dependencies]
|
|
16
|
+
dev = ["pytest>=7.0"]
|
|
17
|
+
|
|
18
|
+
[project.urls]
|
|
19
|
+
Homepage = "https://plasmate.app"
|
|
20
|
+
Documentation = "https://plasmate.app/docs/som-spec"
|
|
21
|
+
Repository = "https://github.com/plasmate-labs/plasmate"
|
|
22
|
+
|
|
23
|
+
[tool.pytest.ini_options]
|
|
24
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""som-parser: Parse and query SOM (Semantic Object Model) output."""
|
|
2
|
+
|
|
3
|
+
from .types import (
|
|
4
|
+
ElementAction,
|
|
5
|
+
ElementRole,
|
|
6
|
+
LinkElement,
|
|
7
|
+
ListItem,
|
|
8
|
+
RegionRole,
|
|
9
|
+
SelectOption,
|
|
10
|
+
SemanticHint,
|
|
11
|
+
Som,
|
|
12
|
+
SomElement,
|
|
13
|
+
SomElementAttrs,
|
|
14
|
+
SomMeta,
|
|
15
|
+
SomRegion,
|
|
16
|
+
StructuredData,
|
|
17
|
+
)
|
|
18
|
+
from .parser import from_plasmate, is_valid_som, parse_som
|
|
19
|
+
from .query import (
|
|
20
|
+
filter_elements,
|
|
21
|
+
find_by_id,
|
|
22
|
+
find_by_role,
|
|
23
|
+
find_by_text,
|
|
24
|
+
get_all_elements,
|
|
25
|
+
get_compression_ratio,
|
|
26
|
+
get_forms,
|
|
27
|
+
get_headings,
|
|
28
|
+
get_inputs,
|
|
29
|
+
get_interactive_elements,
|
|
30
|
+
get_links,
|
|
31
|
+
get_text,
|
|
32
|
+
get_text_by_region,
|
|
33
|
+
to_markdown,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
__all__ = [
|
|
37
|
+
# Types
|
|
38
|
+
"ElementAction",
|
|
39
|
+
"ElementRole",
|
|
40
|
+
"LinkElement",
|
|
41
|
+
"ListItem",
|
|
42
|
+
"RegionRole",
|
|
43
|
+
"SelectOption",
|
|
44
|
+
"SemanticHint",
|
|
45
|
+
"Som",
|
|
46
|
+
"SomElement",
|
|
47
|
+
"SomElementAttrs",
|
|
48
|
+
"SomMeta",
|
|
49
|
+
"SomRegion",
|
|
50
|
+
"StructuredData",
|
|
51
|
+
# Parser
|
|
52
|
+
"from_plasmate",
|
|
53
|
+
"is_valid_som",
|
|
54
|
+
"parse_som",
|
|
55
|
+
# Query
|
|
56
|
+
"filter_elements",
|
|
57
|
+
"find_by_id",
|
|
58
|
+
"find_by_role",
|
|
59
|
+
"find_by_text",
|
|
60
|
+
"get_all_elements",
|
|
61
|
+
"get_compression_ratio",
|
|
62
|
+
"get_forms",
|
|
63
|
+
"get_headings",
|
|
64
|
+
"get_inputs",
|
|
65
|
+
"get_interactive_elements",
|
|
66
|
+
"get_links",
|
|
67
|
+
"get_text",
|
|
68
|
+
"get_text_by_region",
|
|
69
|
+
"to_markdown",
|
|
70
|
+
]
|
|
71
|
+
|
|
72
|
+
__version__ = "0.3.0"
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Parse and validate SOM JSON."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any, Union
|
|
7
|
+
|
|
8
|
+
from pydantic import ValidationError
|
|
9
|
+
|
|
10
|
+
from .types import Som
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def parse_som(input: Union[str, dict]) -> Som:
    """Build a validated ``Som`` from a JSON document or a decoded dict.

    Args:
        input: The SOM either as a JSON string or as the equivalent dict.

    Returns:
        The ``Som`` instance produced by ``Som.model_validate``.

    Raises:
        TypeError: If ``input`` is neither a ``str`` nor a ``dict``.
        ValueError: If ``input`` is a string that is not valid JSON.
        ValidationError: If the data does not conform to the SOM schema.
    """
    # A dict is already decoded, so it goes straight to schema validation.
    if isinstance(input, dict):
        return Som.model_validate(input)

    if not isinstance(input, str):
        raise TypeError(f"Expected str or dict, got {type(input).__name__}")

    try:
        decoded = json.loads(input)
    except json.JSONDecodeError as exc:
        # Re-raise as ValueError so callers need not know about the json module.
        raise ValueError(f"Invalid JSON: {exc}") from exc

    return Som.model_validate(decoded)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def is_valid_som(input: Any) -> bool:
    """Report whether ``input`` can be parsed as a SOM document.

    Args:
        input: A JSON string, a dict, or any other value.

    Returns:
        ``True`` when ``parse_som`` accepts the input, ``False`` when it
        raises ``TypeError``, ``ValueError`` or ``ValidationError``.
    """
    try:
        parse_som(input)
    except (TypeError, ValueError, ValidationError):
        return False
    else:
        return True
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def from_plasmate(json_output: str) -> Som:
    """Turn raw Plasmate CLI JSON output into a validated ``Som``.

    The output may be the SOM document itself, or a container object that
    holds the SOM under a ``som`` key.

    Args:
        json_output: Raw JSON string from Plasmate CLI.

    Returns:
        The ``Som`` instance produced by ``Som.model_validate``.

    Raises:
        ValueError: If the output is not valid JSON.
    """
    try:
        payload = json.loads(json_output)
    except json.JSONDecodeError as exc:
        raise ValueError(f"Invalid JSON from Plasmate: {exc}") from exc

    # A wrapper looks like {"som": {...}}. An object that already carries
    # "som_version" is treated as the SOM itself, even if it has a "som" key.
    is_wrapped = (
        isinstance(payload, dict)
        and "som_version" not in payload
        and "som" in payload
    )
    if is_wrapped:
        payload = payload["som"]

    return Som.model_validate(payload)
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
"""Query, filter, and search utilities for SOM objects."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Callable, Dict, List, Optional, Union
|
|
6
|
+
|
|
7
|
+
from .types import ElementRole, RegionRole, Som, SomElement, SomRegion
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _collect_elements(elements: List[SomElement]) -> List[SomElement]:
    """Return ``elements`` and all their descendants in depth-first pre-order.

    Each element is followed immediately by its own children (and their
    children), before the next sibling.
    """
    flat: List[SomElement] = []
    # Explicit stack instead of recursion. Items are pushed reversed so that
    # popping from the end visits them in their original order.
    pending: List[SomElement] = list(reversed(elements))
    while pending:
        node = pending.pop()
        flat.append(node)
        if node.children:
            pending.extend(reversed(node.children))
    return flat
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def get_all_elements(som: Som) -> List[SomElement]:
    """Return every element of every region, nested children included.

    Regions are visited in document order; within a region the elements
    come back in the order produced by ``_collect_elements``.
    """
    return [
        element
        for region in som.regions
        for element in _collect_elements(region.elements)
    ]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def find_by_role(som: Som, role: Union[ElementRole, str]) -> List[SomElement]:
    """Return all elements whose role equals ``role``.

    Args:
        som: The parsed SOM object.
        role: An ``ElementRole`` member or its string value (e.g. "link").
            A string is converted with ``ElementRole(role)``.
    """
    wanted = ElementRole(role) if isinstance(role, str) else role
    matches: List[SomElement] = []
    for element in get_all_elements(som):
        if element.role == wanted:
            matches.append(element)
    return matches
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def find_by_id(som: Som, id: str) -> Optional[SomElement]:
    """Return the first element whose ``id`` equals ``id``, or ``None``."""
    candidates = (element for element in get_all_elements(som) if element.id == id)
    return next(candidates, None)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def find_by_text(
    som: Som, text: str, *, exact: bool = False
) -> List[SomElement]:
    """Find elements whose ``text`` or ``label`` matches ``text``.

    A missing ``text`` or ``label`` on an element is treated as the empty
    string for the comparison.

    Args:
        som: The parsed SOM object.
        text: The text to search for.
        exact: If True, match the full text exactly (case-sensitive).
            If False (default), case-insensitive substring match. Note
            that an empty ``text`` is a substring of everything, so it
            matches every element in this mode.

    Returns:
        The matching elements, in the order ``get_all_elements`` yields them.
    """
    elements = get_all_elements(som)

    if exact:
        return [
            el
            for el in elements
            if text == (el.text or "") or text == (el.label or "")
        ]

    # Lower-case the needle once instead of once per element.
    needle = text.lower()
    return [
        el
        for el in elements
        if needle in (el.text or "").lower() or needle in (el.label or "").lower()
    ]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def get_interactive_elements(som: Som) -> List[SomElement]:
    """Return the elements whose ``actions`` list is present and non-empty."""
    interactive: List[SomElement] = []
    for element in get_all_elements(som):
        if element.actions:
            interactive.append(element)
    return interactive
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def get_links(som: Som) -> List[Dict[str, Optional[str]]]:
    """Summarise every link element as a ``{text, href, id}`` dict.

    ``href`` is ``None`` when the element carries no ``attrs``.
    """
    return [
        {
            "text": link.text,
            "href": link.attrs.href if link.attrs else None,
            "id": link.id,
        }
        for link in find_by_role(som, ElementRole.LINK)
    ]
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def get_forms(som: Som) -> List[SomRegion]:
    """Return the top-level regions whose role is ``RegionRole.FORM``."""
    forms: List[SomRegion] = []
    for region in som.regions:
        if region.role == RegionRole.FORM:
            forms.append(region)
    return forms
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def get_inputs(som: Som) -> List[SomElement]:
    """Return all form-control elements.

    An element qualifies when its role is one of text_input, textarea,
    select, checkbox or radio.
    """
    wanted = frozenset(
        (
            ElementRole.TEXT_INPUT,
            ElementRole.TEXTAREA,
            ElementRole.SELECT,
            ElementRole.CHECKBOX,
            ElementRole.RADIO,
        )
    )
    controls: List[SomElement] = []
    for element in get_all_elements(som):
        if element.role in wanted:
            controls.append(element)
    return controls
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def get_headings(som: Som) -> List[Dict[str, object]]:
    """List every heading element as a ``{level, text, id}`` dict.

    ``level`` falls back to 0 when the heading has no ``attrs`` or its
    ``attrs.level`` is ``None``.
    """
    outline: List[Dict[str, object]] = []
    for heading in find_by_role(som, ElementRole.HEADING):
        attrs = heading.attrs
        if attrs and attrs.level is not None:
            depth = attrs.level
        else:
            depth = 0
        outline.append({"level": depth, "text": heading.text, "id": heading.id})
    return outline
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def get_text(som: Som) -> str:
    """Join the text of all elements into one newline-separated string.

    For each element the ``text`` is used when non-empty, otherwise the
    ``label``; elements with neither contribute nothing.
    """
    fragments = [
        element.text or element.label
        for element in get_all_elements(som)
        if element.text or element.label
    ]
    return "\n".join(fragments)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def get_text_by_region(som: Som) -> List[Dict[str, object]]:
    """Return one dict per region with its id, role, label and joined text.

    The ``text`` value is built like ``get_text`` but restricted to the
    region: each element contributes its ``text`` if non-empty, else its
    ``label``, and the pieces are joined with newlines.
    """
    grouped: List[Dict[str, object]] = []
    for region in som.regions:
        fragments = [
            element.text or element.label
            for element in _collect_elements(region.elements)
            if element.text or element.label
        ]
        entry: Dict[str, object] = {
            "region_id": region.id,
            "role": region.role.value,
            "label": region.label,
            "text": "\n".join(fragments),
        }
        grouped.append(entry)
    return grouped
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def get_compression_ratio(som: Som) -> float:
    """Return ``meta.html_bytes / meta.som_bytes``.

    When ``som_bytes`` is zero the ratio is reported as ``float("inf")``
    rather than raising ``ZeroDivisionError``.
    """
    meta = som.meta
    denominator = meta.som_bytes
    if denominator != 0:
        return meta.html_bytes / denominator
    return float("inf")
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def to_markdown(som: Som) -> str:
    """Render a SOM object as a markdown document.

    The output starts with the page title as an ``#`` heading and the URL,
    followed by one ``##`` section per region. Each region's elements
    (nested children included) are rendered by ``_element_to_markdown``.
    """
    out: List[str] = [f"# {som.title}", f"URL: {som.url}", ""]

    for region in som.regions:
        role_label = region.role.value.title()
        section_heading = (
            f"## {role_label}: {region.label}"
            if region.label
            else f"## {role_label}"
        )
        out.append(section_heading)
        out.append("")

        for element in _collect_elements(region.elements):
            _element_to_markdown(element, out)

        # Blank line separating this region from the next one.
        out.append("")

    return "\n".join(out)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _element_to_markdown(el: SomElement, lines: List[str]) -> None:
    """Append the markdown rendering of a single element to ``lines``.

    The rendering depends on ``el.role``: headings, paragraphs, links,
    buttons, images, inputs, selects, checkboxes/radios, lists and
    separators each have their own form; any other role emits ``el.text``
    when it is non-empty. Children are not visited here.

    Args:
        el: The element to render.
        lines: Output buffer; it is mutated in place.
    """
    role = el.role
    attrs = el.attrs

    if role == ElementRole.HEADING:
        level = attrs.level if attrs and attrs.level else 1
        # Regions are rendered as "##", so element headings are offset by 2.
        # Markdown has no heading deeper than six "#", so clamp at 6.
        prefix = "#" * min(level + 2, 6)
        lines.append(f"{prefix} {el.text or ''}")
    elif role == ElementRole.PARAGRAPH:
        lines.append(el.text or "")
        lines.append("")
    elif role == ElementRole.LINK:
        # Fall back to "#" when attrs is missing and also when href is unset,
        # so the output never contains the literal string "None".
        href = (attrs.href if attrs else None) or "#"
        lines.append(f"[{el.text or ''}]({href})")
    elif role == ElementRole.BUTTON:
        lines.append(f"[Button: {el.text or ''}]")
    elif role == ElementRole.IMAGE:
        # Emit a real markdown image; missing alt/src become empty strings.
        alt = (attrs.alt if attrs else None) or ""
        src = (attrs.src if attrs else None) or ""
        lines.append(f"![{alt}]({src})")
    elif role in (ElementRole.TEXT_INPUT, ElementRole.TEXTAREA):
        label = el.label or ""
        placeholder = ""
        if attrs and attrs.placeholder:
            placeholder = f' placeholder="{attrs.placeholder}"'
        lines.append(f"[Input: {label}{placeholder}]")
    elif role == ElementRole.SELECT:
        lines.append(f"[Select: {el.label or el.text or ''}]")
    elif role in (ElementRole.CHECKBOX, ElementRole.RADIO):
        checked = ""
        if attrs and attrs.checked:
            checked = "x"
        lines.append(f"[{checked}] {el.text or el.label or ''}")
    elif role == ElementRole.LIST:
        if attrs and attrs.items:
            for item in attrs.items:
                lines.append(f"- {item.text}")
    elif role == ElementRole.SEPARATOR:
        lines.append("---")
    else:
        if el.text:
            lines.append(el.text)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def filter_elements(
    som: Som, predicate: Callable[[SomElement], bool]
) -> List[SomElement]:
    """Return the elements for which ``predicate(element)`` is truthy.

    The predicate is called once per element, in the order produced by
    ``get_all_elements``.
    """
    kept: List[SomElement] = []
    for element in get_all_elements(som):
        if predicate(element):
            kept.append(element)
    return kept
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""Pydantic v2 models for SOM (Semantic Object Model)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class RegionRole(str, Enum):
    """String-valued roles accepted by the ``role`` field of ``SomRegion``."""

    NAVIGATION = "navigation"
    MAIN = "main"
    ASIDE = "aside"
    HEADER = "header"
    FOOTER = "footer"
    FORM = "form"
    DIALOG = "dialog"
    CONTENT = "content"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ElementRole(str, Enum):
    """String-valued roles accepted by the ``role`` field of ``SomElement``."""

    LINK = "link"
    BUTTON = "button"
    TEXT_INPUT = "text_input"
    TEXTAREA = "textarea"
    SELECT = "select"
    CHECKBOX = "checkbox"
    RADIO = "radio"
    HEADING = "heading"
    IMAGE = "image"
    LIST = "list"
    TABLE = "table"
    PARAGRAPH = "paragraph"
    SECTION = "section"
    SEPARATOR = "separator"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class ElementAction(str, Enum):
    """String-valued actions that may appear in ``SomElement.actions``."""

    CLICK = "click"
    TYPE = "type"
    CLEAR = "clear"
    SELECT = "select"
    TOGGLE = "toggle"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class SemanticHint(str, Enum):
    """String-valued hints that may appear in ``SomElement.hints``."""

    ACTIVE = "active"
    BADGE = "badge"
    CARD = "card"
    COLLAPSED = "collapsed"
    DANGER = "danger"
    DISABLED = "disabled"
    ERROR = "error"
    EXPANDED = "expanded"
    HERO = "hero"
    HIDDEN = "hidden"
    LARGE = "large"
    LOADING = "loading"
    MODAL = "modal"
    NOTIFICATION = "notification"
    PRIMARY = "primary"
    REQUIRED = "required"
    SECONDARY = "secondary"
    SELECTED = "selected"
    SMALL = "small"
    STICKY = "sticky"
    SUCCESS = "success"
    WARNING = "warning"
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class SelectOption(BaseModel):
    """One entry of ``SomElementAttrs.options``."""

    value: str
    text: str
    # Optional flag; None when the source data omits it.
    selected: Optional[bool] = None
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class ListItem(BaseModel):
    """One entry of ``SomElementAttrs.items``; carries only its text."""

    text: str
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class SomElementAttrs(BaseModel):
    """Optional attributes attached to a ``SomElement``.

    Every field is optional and defaults to ``None``.
    """

    href: Optional[str] = None
    input_type: Optional[str] = None
    value: Optional[str] = None
    placeholder: Optional[str] = None
    required: Optional[bool] = None
    disabled: Optional[bool] = None
    checked: Optional[bool] = None
    group: Optional[str] = None
    multiple: Optional[bool] = None
    options: Optional[List[SelectOption]] = None
    level: Optional[int] = None
    alt: Optional[str] = None
    src: Optional[str] = None
    ordered: Optional[bool] = None
    items: Optional[List[ListItem]] = None
    headers: Optional[List[str]] = None
    rows: Optional[List[List[str]]] = None
    section_label: Optional[str] = None
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class SomElement(BaseModel):
    """A single element inside a region.

    Only ``id`` and ``role`` are required; all other fields default to
    ``None``. ``children`` holds nested elements of this same model.
    """

    id: str
    role: ElementRole
    text: Optional[str] = None
    label: Optional[str] = None
    actions: Optional[List[ElementAction]] = None
    attrs: Optional[SomElementAttrs] = None
    # Self-referencing field: nested elements use the same model.
    children: Optional[List[SomElement]] = None
    hints: Optional[List[SemanticHint]] = None
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class SomRegion(BaseModel):
    """A region of the page: an id, a role and its list of elements.

    ``label``, ``action`` and ``method`` are optional and default to ``None``.
    """

    id: str
    role: RegionRole
    label: Optional[str] = None
    # NOTE(review): action/method presumably describe form submission — confirm against the SOM spec.
    action: Optional[str] = None
    method: Optional[str] = None
    elements: List[SomElement]
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class SomMeta(BaseModel):
    """Required size and count metadata stored in ``Som.meta``."""

    html_bytes: int
    som_bytes: int
    element_count: int
    interactive_count: int
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class LinkElement(BaseModel):
    """One entry of ``StructuredData.links``.

    ``rel`` and ``href`` are required; ``type`` and ``hreflang`` default to
    ``None``.
    """

    rel: str
    href: str
    type: Optional[str] = None
    hreflang: Optional[str] = None
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class StructuredData(BaseModel):
    """Optional structured-data payload stored in ``Som.structured_data``.

    Every field is optional and defaults to ``None``.
    """

    json_ld: Optional[List[Dict[str, Any]]] = None
    open_graph: Optional[Dict[str, str]] = None
    twitter_card: Optional[Dict[str, str]] = None
    meta: Optional[Dict[str, str]] = None
    links: Optional[List[LinkElement]] = None
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class Som(BaseModel):
    """Top-level SOM document.

    All fields are required except ``structured_data``, which defaults to
    ``None``.
    """

    som_version: str
    url: str
    title: str
    lang: str
    regions: List[SomRegion]
    meta: SomMeta
    structured_data: Optional[StructuredData] = None
|
|
File without changes
|
|
@@ -0,0 +1,421 @@
|
|
|
1
|
+
"""Tests for som-parser package."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from som_parser import (
|
|
8
|
+
ElementRole,
|
|
9
|
+
RegionRole,
|
|
10
|
+
Som,
|
|
11
|
+
SomElement,
|
|
12
|
+
filter_elements,
|
|
13
|
+
find_by_id,
|
|
14
|
+
find_by_role,
|
|
15
|
+
find_by_text,
|
|
16
|
+
from_plasmate,
|
|
17
|
+
get_all_elements,
|
|
18
|
+
get_compression_ratio,
|
|
19
|
+
get_forms,
|
|
20
|
+
get_headings,
|
|
21
|
+
get_inputs,
|
|
22
|
+
get_interactive_elements,
|
|
23
|
+
get_links,
|
|
24
|
+
get_text,
|
|
25
|
+
get_text_by_region,
|
|
26
|
+
is_valid_som,
|
|
27
|
+
parse_som,
|
|
28
|
+
to_markdown,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
# Shared SOM document used by every test below: three regions
# (navigation, content, form) holding eight elements in total.
FIXTURE_SOM = {
    "som_version": "0.1",
    "url": "https://example.com/",
    "title": "Example Domain",
    "lang": "en",
    "regions": [
        # Navigation region: two links.
        {
            "id": "r_nav",
            "role": "navigation",
            "elements": [
                {
                    "id": "e_1",
                    "role": "link",
                    "text": "Home",
                    "actions": ["click"],
                    "attrs": {"href": "/"},
                },
                {
                    "id": "e_2",
                    "role": "link",
                    "text": "About",
                    "actions": ["click"],
                    "attrs": {"href": "/about"},
                },
            ],
        },
        # Content region: heading, paragraph, link and image.
        {
            "id": "r_content",
            "role": "content",
            "elements": [
                {
                    "id": "e_3",
                    "role": "heading",
                    "text": "Welcome",
                    "attrs": {"level": 1},
                },
                {
                    "id": "e_4",
                    "role": "paragraph",
                    "text": "This is a test page.",
                },
                {
                    "id": "e_5",
                    "role": "link",
                    "text": "Learn more",
                    "actions": ["click"],
                    "attrs": {"href": "https://example.org"},
                },
                {
                    "id": "e_6",
                    "role": "image",
                    "attrs": {"src": "/logo.png", "alt": "Logo"},
                },
            ],
        },
        # Form region: one text input and one button.
        {
            "id": "r_form",
            "role": "form",
            "action": "/search",
            "method": "GET",
            "elements": [
                {
                    "id": "e_7",
                    "role": "text_input",
                    "label": "Search",
                    "actions": ["type", "clear"],
                    "attrs": {"input_type": "text", "placeholder": "Search..."},
                },
                {
                    "id": "e_8",
                    "role": "button",
                    "text": "Go",
                    "actions": ["click"],
                },
            ],
        },
    ],
    "meta": {
        "html_bytes": 5000,
        "som_bytes": 800,
        "element_count": 8,
        "interactive_count": 5,
    },
}
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@pytest.fixture
def som() -> Som:
    """Provide the shared fixture document parsed into a ``Som`` model."""
    parsed = parse_som(FIXTURE_SOM)
    return parsed
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@pytest.fixture
def som_json() -> str:
    """Provide the shared fixture document serialized as a JSON string."""
    serialized = json.dumps(FIXTURE_SOM)
    return serialized
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# --- Parser tests ---
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class TestParseSom:
    """Checks for ``parse_som`` on dict input, JSON strings and bad input."""

    def test_parse_dict(self, som: Som):
        """A dict input yields a ``Som`` with the fixture's header fields."""
        assert isinstance(som, Som)
        assert som.title == "Example Domain"
        assert som.url == "https://example.com/"
        assert som.som_version == "0.1"
        assert som.lang == "en"
        assert len(som.regions) == 3

    def test_parse_json_string(self, som_json: str):
        """A JSON string input is decoded and parsed like a dict."""
        result = parse_som(som_json)
        assert isinstance(result, Som)
        assert result.title == "Example Domain"

    def test_parse_invalid_json_string(self):
        """Malformed JSON is reported as ``ValueError('Invalid JSON ...')``."""
        with pytest.raises(ValueError, match="Invalid JSON"):
            parse_som("not valid json {{{")

    def test_parse_invalid_schema(self):
        """A well-formed dict that is not a SOM document is rejected."""
        # ``pytest.raises(Exception)`` would also pass on unrelated failures
        # (AttributeError, NameError, ...), hiding real bugs. pydantic v2's
        # ValidationError is a ValueError subclass, so ValueError still
        # matches a validation failure while rejecting everything else.
        # NOTE(review): assumes parse_som surfaces schema errors as pydantic
        # ValidationError (or another ValueError) -- confirm in parser.py.
        with pytest.raises(ValueError):
            parse_som({"not": "a som"})

    def test_parse_wrong_type(self):
        """Inputs that are neither str nor dict raise ``TypeError``."""
        with pytest.raises(TypeError):
            parse_som(42)  # type: ignore

    def test_regions_parsed(self, som: Som):
        """Region roles are converted to ``RegionRole`` members, in order."""
        assert som.regions[0].role == RegionRole.NAVIGATION
        assert som.regions[1].role == RegionRole.CONTENT
        assert som.regions[2].role == RegionRole.FORM

    def test_elements_parsed(self, som: Som):
        """Elements of the navigation region keep their role and text."""
        nav_elements = som.regions[0].elements
        assert len(nav_elements) == 2
        assert nav_elements[0].role == ElementRole.LINK
        assert nav_elements[0].text == "Home"

    def test_meta_parsed(self, som: Som):
        """The ``meta`` block is exposed with its integer statistics."""
        assert som.meta.html_bytes == 5000
        assert som.meta.som_bytes == 800
        assert som.meta.element_count == 8
        assert som.meta.interactive_count == 5

    def test_form_region_attrs(self, som: Som):
        """The form region retains its ``action`` and ``method``."""
        form = som.regions[2]
        assert form.action == "/search"
        assert form.method == "GET"
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
class TestIsValidSom:
    """``is_valid_som`` must answer with a real bool and never raise."""

    def test_valid(self):
        """The fixture dict is accepted."""
        verdict = is_valid_som(FIXTURE_SOM)
        assert verdict is True

    def test_valid_string(self):
        """The fixture serialized to JSON is accepted too."""
        payload = json.dumps(FIXTURE_SOM)
        assert is_valid_som(payload) is True

    def test_invalid_dict(self):
        """A dict lacking the SOM fields is rejected."""
        verdict = is_valid_som({"bad": "data"})
        assert verdict is False

    def test_invalid_string(self):
        """A string that is not JSON is rejected."""
        verdict = is_valid_som("nope")
        assert verdict is False

    def test_invalid_type(self):
        """A non-str, non-dict value is rejected."""
        verdict = is_valid_som(123)
        assert verdict is False
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
class TestFromPlasmate:
    """``from_plasmate`` accepts bare and ``{"som": ...}``-wrapped JSON."""

    def test_direct_som(self):
        """A JSON string holding the SOM document itself is parsed."""
        payload = json.dumps(FIXTURE_SOM)
        parsed = from_plasmate(payload)
        assert parsed.title == "Example Domain"

    def test_wrapped_som(self):
        """A JSON string nesting the document under ``"som"`` is unwrapped."""
        payload = json.dumps({"som": FIXTURE_SOM})
        parsed = from_plasmate(payload)
        assert parsed.title == "Example Domain"

    def test_invalid_json(self):
        """Malformed JSON is reported as ``ValueError('Invalid JSON ...')``."""
        with pytest.raises(ValueError, match="Invalid JSON"):
            from_plasmate("not json")
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
# --- Query tests ---
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
class TestGetAllElements:
    """``get_all_elements`` flattens every region's elements in order."""

    def test_count(self, som: Som):
        """All eight fixture elements are returned."""
        flattened = get_all_elements(som)
        assert len(flattened) == 8

    def test_all_have_ids(self, som: Som):
        """Ids come back in document order across the three regions."""
        found_ids = [element.id for element in get_all_elements(som)]
        assert found_ids == ["e_1", "e_2", "e_3", "e_4", "e_5", "e_6", "e_7", "e_8"]
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
class TestFindByRole:
    """``find_by_role`` selects elements by enum member or plain string."""

    def test_links(self, som: Som):
        """The fixture holds three link elements."""
        matches = find_by_role(som, ElementRole.LINK)
        assert len(matches) == 3

    def test_string_role(self, som: Som):
        """The role may be given as its string value."""
        matches = find_by_role(som, "link")
        assert len(matches) == 3

    def test_headings(self, som: Som):
        """Exactly one heading exists, with the text "Welcome"."""
        matches = find_by_role(som, ElementRole.HEADING)
        assert len(matches) == 1
        assert matches[0].text == "Welcome"

    def test_buttons(self, som: Som):
        """Exactly one button exists, with the text "Go"."""
        matches = find_by_role(som, ElementRole.BUTTON)
        assert len(matches) == 1
        assert matches[0].text == "Go"

    def test_no_results(self, som: Som):
        """A role absent from the document yields an empty list."""
        matches = find_by_role(som, ElementRole.TABLE)
        assert matches == []
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
class TestFindById:
    """``find_by_id`` returns the matching element or ``None``."""

    def test_found(self, som: Som):
        """An existing id resolves to its element."""
        heading = find_by_id(som, "e_3")
        assert heading is not None
        assert heading.text == "Welcome"
        assert heading.role == ElementRole.HEADING

    def test_not_found(self, som: Som):
        """An unknown id resolves to ``None``."""
        missing = find_by_id(som, "nonexistent")
        assert missing is None
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
class TestFindByText:
    """``find_by_text`` matching modes: substring, exact, and label lookup."""

    def test_substring(self, som: Som):
        """A lowercase query finds the "Home" link."""
        hits = find_by_text(som, "home")
        assert len(hits) == 1
        assert hits[0].id == "e_1"

    def test_case_insensitive(self, som: Som):
        """An uppercase query finds the "Welcome" heading."""
        hits = find_by_text(som, "WELCOME")
        assert len(hits) == 1

    def test_exact_match(self, som: Som):
        """With ``exact=True`` an identically cased query matches."""
        hits = find_by_text(som, "Home", exact=True)
        assert len(hits) == 1

    def test_exact_no_match(self, som: Som):
        """With ``exact=True`` a differently cased query does not match."""
        hits = find_by_text(som, "home", exact=True)
        assert len(hits) == 0

    def test_label_match(self, som: Som):
        """The text input, which only has a label, is found by that label."""
        hits = find_by_text(som, "search")
        assert len(hits) == 1
        assert hits[0].id == "e_7"

    def test_no_match(self, som: Som):
        """A query present nowhere returns no elements."""
        hits = find_by_text(som, "xyznonexistent")
        assert len(hits) == 0
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
class TestGetInteractiveElements:
    """``get_interactive_elements`` returns the elements that carry actions."""

    def test_count(self, som: Som):
        """Five fixture elements declare actions."""
        actionable = get_interactive_elements(som)
        assert len(actionable) == 5

    def test_all_have_actions(self, som: Som):
        """Every returned element has a non-empty ``actions`` list."""
        for element in get_interactive_elements(som):
            assert element.actions is not None
            assert len(element.actions) > 0
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
class TestGetLinks:
    """``get_links`` summarizes each link as a text/href/id dict."""

    def test_links(self, som: Som):
        """The three fixture links are reported in document order."""
        expected = [
            {"text": "Home", "href": "/", "id": "e_1"},
            {"text": "About", "href": "/about", "id": "e_2"},
            {"text": "Learn more", "href": "https://example.org", "id": "e_5"},
        ]
        found = get_links(som)
        assert len(found) == 3
        for position, wanted in enumerate(expected):
            assert found[position] == wanted
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
class TestGetForms:
    """``get_forms`` returns the regions whose role is "form"."""

    def test_forms(self, som: Som):
        """The single form region is returned with its action intact."""
        form_regions = get_forms(som)
        assert len(form_regions) == 1
        only_form = form_regions[0]
        assert only_form.id == "r_form"
        assert only_form.action == "/search"
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
class TestGetInputs:
    """``get_inputs`` returns the input elements of the document."""

    def test_inputs(self, som: Som):
        """Only the search text input qualifies in the fixture."""
        fields = get_inputs(som)
        assert len(fields) == 1
        search_box = fields[0]
        assert search_box.id == "e_7"
        assert search_box.role == ElementRole.TEXT_INPUT
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
class TestGetHeadings:
    """``get_headings`` summarizes each heading as a level/text/id dict."""

    def test_headings(self, som: Som):
        """The single level-1 heading is reported."""
        outline = get_headings(som)
        assert len(outline) == 1
        assert outline[0] == {"level": 1, "text": "Welcome", "id": "e_3"}
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
class TestGetText:
    """``get_text`` gathers the text and labels of the whole document."""

    def test_text(self, som: Som):
        """Every text or label of the fixture appears in the output."""
        page_text = get_text(som)
        expected_fragments = (
            "Home",
            "About",
            "Welcome",
            "This is a test page.",
            "Learn more",
            "Search",
            "Go",
        )
        for fragment in expected_fragments:
            assert fragment in page_text
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
class TestGetTextByRegion:
    """``get_text_by_region`` yields one region_id/role/text dict per region."""

    def test_regions(self, som: Som):
        """Three entries are produced; the first describes navigation."""
        per_region = get_text_by_region(som)
        assert len(per_region) == 3
        nav_entry = per_region[0]
        assert nav_entry["region_id"] == "r_nav"
        assert nav_entry["role"] == "navigation"
        assert "Home" in nav_entry["text"]

    def test_content_region(self, som: Som):
        """The second entry carries the content region's text."""
        content_entry = get_text_by_region(som)[1]
        assert content_entry["role"] == "content"
        assert "Welcome" in content_entry["text"]
        assert "This is a test page." in content_entry["text"]
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
class TestGetCompressionRatio:
    """``get_compression_ratio`` relates ``html_bytes`` to ``som_bytes``."""

    def test_ratio(self, som: Som):
        """For the fixture (5000 / 800) the ratio is exactly 6.25."""
        computed = get_compression_ratio(som)
        # 6.25 is exactly representable in binary floating point, so strict
        # equality is safe here.
        assert computed == 5000 / 800
        assert computed == 6.25
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
class TestToMarkdown:
    """Checks that ``to_markdown`` renders each kind of fixture element."""

    def test_contains_title(self, som: Som):
        """The page title becomes a level-1 heading."""
        md = to_markdown(som)
        assert "# Example Domain" in md

    def test_contains_url(self, som: Som):
        """The page URL is emitted on a ``URL:`` line."""
        md = to_markdown(som)
        assert "URL: https://example.com/" in md

    def test_contains_regions(self, som: Som):
        """Each region is introduced by a level-2 heading named by role."""
        md = to_markdown(som)
        assert "## Navigation" in md
        assert "## Content" in md
        assert "## Form" in md

    def test_contains_links(self, som: Som):
        """Links are rendered as Markdown inline links."""
        md = to_markdown(som)
        assert "[Home](/)" in md
        assert "[About](/about)" in md

    def test_contains_heading(self, som: Som):
        """The fixture's level-1 heading is rendered as ``### Welcome``."""
        md = to_markdown(som)
        assert "### Welcome" in md

    def test_contains_paragraph(self, som: Som):
        """Paragraph text is emitted verbatim."""
        md = to_markdown(som)
        assert "This is a test page." in md

    def test_contains_image(self, som: Som):
        """The image element is rendered with its alt text and source."""
        md = to_markdown(som)
        # The previous assertion was ``"" in md``, which is true for every
        # string and therefore verified nothing. Assert on the image built
        # from the fixture's alt ("Logo") and src ("/logo.png") instead.
        # NOTE(review): assumes to_markdown uses standard Markdown image
        # syntax -- confirm against query.py.
        assert "![Logo](/logo.png)" in md

    def test_contains_button(self, som: Som):
        """Buttons are rendered as ``[Button: <text>]``."""
        md = to_markdown(som)
        assert "[Button: Go]" in md

    def test_contains_input(self, som: Som):
        """Inputs are rendered with an ``Input: <label>`` marker."""
        md = to_markdown(som)
        assert "Input: Search" in md
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
class TestFilterElements:
    """``filter_elements`` keeps the elements accepted by a predicate."""

    def test_filter_by_actions(self, som: Som):
        """Elements offering a "click" action: three links plus one button."""

        def offers_click(el) -> bool:
            # Elements without actions are excluded before looking at values.
            if el.actions is None:
                return False
            return any(action.value == "click" for action in el.actions)

        clickable = filter_elements(som, offers_click)
        assert len(clickable) == 4  # 3 links + 1 button

    def test_filter_by_text(self, som: Som):
        """Elements with text: everything except the image and text input."""

        def has_text(el) -> bool:
            return el.text is not None

        with_text = filter_elements(som, has_text)
        assert len(with_text) == 6  # all except image and text_input
|