som-parser 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
1
+ /target
2
+ *.swp
3
+ *.swo
4
+ *~
5
+ .DS_Store
6
+ report.md
7
+ .claude/settings.local.json
8
+ smoke/node_modules/
9
+ sdk/node/node_modules/
10
+ sdk/node/dist/
11
+ sdk/python/*.egg-info/
12
+ sdk/python/dist/
13
+ sdk/python/build/
14
+ __pycache__/
15
+ *.pyc
16
+
17
+ # MCP registry publisher tokens
18
+ .mcpregistry_*
19
+ .vercel
20
+ packages/*/node_modules/
21
+ packages/*/.venv/
22
+ packages/*/dist/
@@ -0,0 +1,144 @@
1
+ Metadata-Version: 2.4
2
+ Name: som-parser
3
+ Version: 0.3.0
4
+ Summary: Parse and query SOM (Semantic Object Model) - the structured web format for AI agents
5
+ Project-URL: Homepage, https://plasmate.app
6
+ Project-URL: Documentation, https://plasmate.app/docs/som-spec
7
+ Project-URL: Repository, https://github.com/plasmate-labs/plasmate
8
+ License-Expression: Apache-2.0
9
+ Keywords: ai-agents,browser-automation,plasmate,semantic-object-model,som,web-scraping
10
+ Requires-Python: >=3.9
11
+ Requires-Dist: pydantic>=2.0
12
+ Provides-Extra: dev
13
+ Requires-Dist: pytest>=7.0; extra == 'dev'
14
+ Description-Content-Type: text/markdown
15
+
16
+ # som-parser
17
+
18
+ Parse and query [SOM (Semantic Object Model)](https://plasmate.app/docs/som-spec) output in Python. SOM is a structured JSON format that represents web pages as semantic regions and elements, designed for AI agents, browser automation, and web scraping. This library provides Pydantic v2 models for type-safe parsing, validation, and a rich set of query utilities to extract exactly what you need.
19
+
20
+ ## Install
21
+
22
+ ```bash
23
+ pip install som-parser
24
+ ```
25
+
26
+ ## Quick Start
27
+
28
+ ### Parse Plasmate output
29
+
30
+ ```python
31
+ import subprocess
32
+ from som_parser import parse_som, from_plasmate
33
+
34
+ # Parse a SOM JSON string or dict
35
+ som = parse_som('{"som_version": "0.1", ...}')
36
+
37
+ # Or parse raw Plasmate CLI output directly
38
+ result = subprocess.run(["plasmate", "https://example.com"], capture_output=True, text=True)
39
+ som = from_plasmate(result.stdout)
40
+
41
+ print(som.title) # "Example Domain"
42
+ print(som.url) # "https://example.com/"
43
+ print(som.som_version) # "0.1"
44
+ ```
45
+
46
+ ### Find links
47
+
48
+ ```python
49
+ from som_parser import parse_som, get_links, find_by_role
50
+
51
+ som = parse_som(data)
52
+
53
+ # Get all links as simple dicts
54
+ for link in get_links(som):
55
+ print(f"{link['text']} -> {link['href']}")
56
+
57
+ # Or find by role for full SomElement objects
58
+ for el in find_by_role(som, "link"):
59
+ print(el.id, el.text, el.attrs.href)
60
+ ```
61
+
62
+ ### Get interactive elements
63
+
64
+ ```python
65
+ from som_parser import parse_som, get_interactive_elements
66
+
67
+ som = parse_som(data)
68
+ for el in get_interactive_elements(som):
69
+ print(f"{el.id}: {el.role.value} - actions: {[a.value for a in el.actions]}")
70
+ ```
71
+
72
+ ### Convert to markdown
73
+
74
+ ```python
75
+ from som_parser import parse_som, to_markdown
76
+
77
+ som = parse_som(data)
78
+ print(to_markdown(som))
79
+ ```
80
+
81
+ ### Use Pydantic models directly
82
+
83
+ ```python
84
+ from som_parser import Som, SomElement, ElementRole
85
+
86
+ # Validate and construct from a dict
87
+ som = Som.model_validate(my_dict)
88
+
89
+ # Access typed fields
90
+ for region in som.regions:
91
+ for element in region.elements:
92
+ if element.role == ElementRole.LINK:
93
+ print(element.attrs.href)
94
+
95
+ # Serialize back to JSON
96
+ print(som.model_dump_json(indent=2))
97
+ ```
98
+
99
+ ## API Reference
100
+
101
+ ### Parser
102
+
103
+ | Function | Description |
104
+ |----------|-------------|
105
+ | `parse_som(input: str \| dict) -> Som` | Parse JSON string or dict into a validated Som object |
106
+ | `is_valid_som(input) -> bool` | Check if input conforms to the SOM schema |
107
+ | `from_plasmate(json_output: str) -> Som` | Parse raw Plasmate CLI JSON output |
108
+
109
+ ### Query Utilities
110
+
111
+ | Function | Description |
112
+ |----------|-------------|
113
+ | `get_all_elements(som) -> list[SomElement]` | Flatten all elements from all regions |
114
+ | `find_by_role(som, role) -> list[SomElement]` | Find elements by role (enum or string) |
115
+ | `find_by_id(som, id) -> SomElement \| None` | Find a single element by its SOM id |
116
+ | `find_by_text(som, text, exact=False) -> list[SomElement]` | Search elements by text content |
117
+ | `get_interactive_elements(som) -> list[SomElement]` | Get elements that have actions |
118
+ | `get_links(som) -> list[dict]` | Extract all links as `{text, href, id}` dicts |
119
+ | `get_forms(som) -> list[SomRegion]` | Get all form regions |
120
+ | `get_inputs(som) -> list[SomElement]` | Get all input elements |
121
+ | `get_headings(som) -> list[dict]` | Extract heading hierarchy as `{level, text, id}` |
122
+ | `get_text(som) -> str` | Extract all visible text content |
123
+ | `get_text_by_region(som) -> list[dict]` | Extract text grouped by region |
124
+ | `get_compression_ratio(som) -> float` | Return `html_bytes / som_bytes` |
125
+ | `to_markdown(som) -> str` | Convert SOM to readable markdown |
126
+ | `filter_elements(som, predicate) -> list[SomElement]` | Generic filter with a callable |
127
+
128
+ ### Types
129
+
130
+ All Pydantic v2 models are exported from the top level:
131
+
132
+ - `Som`, `SomRegion`, `SomElement`, `SomElementAttrs`, `SomMeta`
133
+ - `StructuredData`, `LinkElement`, `SelectOption`, `ListItem`
134
+ - `RegionRole`, `ElementRole`, `ElementAction`, `SemanticHint` (enums)
135
+
136
+ ## Links
137
+
138
+ - [SOM Spec](https://plasmate.app/docs/som-spec)
139
+ - [Plasmate](https://plasmate.app)
140
+ - [GitHub Repository](https://github.com/plasmate-labs/plasmate)
141
+
142
+ ## License
143
+
144
+ Apache-2.0
@@ -0,0 +1,129 @@
1
+ # som-parser
2
+
3
+ Parse and query [SOM (Semantic Object Model)](https://plasmate.app/docs/som-spec) output in Python. SOM is a structured JSON format that represents web pages as semantic regions and elements, designed for AI agents, browser automation, and web scraping. This library provides Pydantic v2 models for type-safe parsing, validation, and a rich set of query utilities to extract exactly what you need.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install som-parser
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ### Parse Plasmate output
14
+
15
+ ```python
16
+ import subprocess
17
+ from som_parser import parse_som, from_plasmate
18
+
19
+ # Parse a SOM JSON string or dict
20
+ som = parse_som('{"som_version": "0.1", ...}')
21
+
22
+ # Or parse raw Plasmate CLI output directly
23
+ result = subprocess.run(["plasmate", "https://example.com"], capture_output=True, text=True)
24
+ som = from_plasmate(result.stdout)
25
+
26
+ print(som.title) # "Example Domain"
27
+ print(som.url) # "https://example.com/"
28
+ print(som.som_version) # "0.1"
29
+ ```
30
+
31
+ ### Find links
32
+
33
+ ```python
34
+ from som_parser import parse_som, get_links, find_by_role
35
+
36
+ som = parse_som(data)
37
+
38
+ # Get all links as simple dicts
39
+ for link in get_links(som):
40
+ print(f"{link['text']} -> {link['href']}")
41
+
42
+ # Or find by role for full SomElement objects
43
+ for el in find_by_role(som, "link"):
44
+ print(el.id, el.text, el.attrs.href)
45
+ ```
46
+
47
+ ### Get interactive elements
48
+
49
+ ```python
50
+ from som_parser import parse_som, get_interactive_elements
51
+
52
+ som = parse_som(data)
53
+ for el in get_interactive_elements(som):
54
+ print(f"{el.id}: {el.role.value} - actions: {[a.value for a in el.actions]}")
55
+ ```
56
+
57
+ ### Convert to markdown
58
+
59
+ ```python
60
+ from som_parser import parse_som, to_markdown
61
+
62
+ som = parse_som(data)
63
+ print(to_markdown(som))
64
+ ```
65
+
66
+ ### Use Pydantic models directly
67
+
68
+ ```python
69
+ from som_parser import Som, SomElement, ElementRole
70
+
71
+ # Validate and construct from a dict
72
+ som = Som.model_validate(my_dict)
73
+
74
+ # Access typed fields
75
+ for region in som.regions:
76
+ for element in region.elements:
77
+ if element.role == ElementRole.LINK:
78
+ print(element.attrs.href)
79
+
80
+ # Serialize back to JSON
81
+ print(som.model_dump_json(indent=2))
82
+ ```
83
+
84
+ ## API Reference
85
+
86
+ ### Parser
87
+
88
+ | Function | Description |
89
+ |----------|-------------|
90
+ | `parse_som(input: str \| dict) -> Som` | Parse JSON string or dict into a validated Som object |
91
+ | `is_valid_som(input) -> bool` | Check if input conforms to the SOM schema |
92
+ | `from_plasmate(json_output: str) -> Som` | Parse raw Plasmate CLI JSON output |
93
+
94
+ ### Query Utilities
95
+
96
+ | Function | Description |
97
+ |----------|-------------|
98
+ | `get_all_elements(som) -> list[SomElement]` | Flatten all elements from all regions |
99
+ | `find_by_role(som, role) -> list[SomElement]` | Find elements by role (enum or string) |
100
+ | `find_by_id(som, id) -> SomElement \| None` | Find a single element by its SOM id |
101
+ | `find_by_text(som, text, exact=False) -> list[SomElement]` | Search elements by text content |
102
+ | `get_interactive_elements(som) -> list[SomElement]` | Get elements that have actions |
103
+ | `get_links(som) -> list[dict]` | Extract all links as `{text, href, id}` dicts |
104
+ | `get_forms(som) -> list[SomRegion]` | Get all form regions |
105
+ | `get_inputs(som) -> list[SomElement]` | Get all input elements |
106
+ | `get_headings(som) -> list[dict]` | Extract heading hierarchy as `{level, text, id}` |
107
+ | `get_text(som) -> str` | Extract all visible text content |
108
+ | `get_text_by_region(som) -> list[dict]` | Extract text grouped by region |
109
+ | `get_compression_ratio(som) -> float` | Return `html_bytes / som_bytes` |
110
+ | `to_markdown(som) -> str` | Convert SOM to readable markdown |
111
+ | `filter_elements(som, predicate) -> list[SomElement]` | Generic filter with a callable |
112
+
113
+ ### Types
114
+
115
+ All Pydantic v2 models are exported from the top level:
116
+
117
+ - `Som`, `SomRegion`, `SomElement`, `SomElementAttrs`, `SomMeta`
118
+ - `StructuredData`, `LinkElement`, `SelectOption`, `ListItem`
119
+ - `RegionRole`, `ElementRole`, `ElementAction`, `SemanticHint` (enums)
120
+
121
+ ## Links
122
+
123
+ - [SOM Spec](https://plasmate.app/docs/som-spec)
124
+ - [Plasmate](https://plasmate.app)
125
+ - [GitHub Repository](https://github.com/plasmate-labs/plasmate)
126
+
127
+ ## License
128
+
129
+ Apache-2.0
@@ -0,0 +1,24 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "som-parser"
7
+ version = "0.3.0"
8
+ description = "Parse and query SOM (Semantic Object Model) - the structured web format for AI agents"
9
+ readme = "README.md"
10
+ license = "Apache-2.0"
11
+ requires-python = ">=3.9"
12
+ dependencies = ["pydantic>=2.0"]
13
+ keywords = ["som", "semantic-object-model", "web-scraping", "ai-agents", "browser-automation", "plasmate"]
14
+
15
+ [project.optional-dependencies]
16
+ dev = ["pytest>=7.0"]
17
+
18
+ [project.urls]
19
+ Homepage = "https://plasmate.app"
20
+ Documentation = "https://plasmate.app/docs/som-spec"
21
+ Repository = "https://github.com/plasmate-labs/plasmate"
22
+
23
+ [tool.pytest.ini_options]
24
+ testpaths = ["tests"]
@@ -0,0 +1,72 @@
1
+ """som-parser: Parse and query SOM (Semantic Object Model) output."""
2
+
3
+ from .types import (
4
+ ElementAction,
5
+ ElementRole,
6
+ LinkElement,
7
+ ListItem,
8
+ RegionRole,
9
+ SelectOption,
10
+ SemanticHint,
11
+ Som,
12
+ SomElement,
13
+ SomElementAttrs,
14
+ SomMeta,
15
+ SomRegion,
16
+ StructuredData,
17
+ )
18
+ from .parser import from_plasmate, is_valid_som, parse_som
19
+ from .query import (
20
+ filter_elements,
21
+ find_by_id,
22
+ find_by_role,
23
+ find_by_text,
24
+ get_all_elements,
25
+ get_compression_ratio,
26
+ get_forms,
27
+ get_headings,
28
+ get_inputs,
29
+ get_interactive_elements,
30
+ get_links,
31
+ get_text,
32
+ get_text_by_region,
33
+ to_markdown,
34
+ )
35
+
36
+ __all__ = [
37
+ # Types
38
+ "ElementAction",
39
+ "ElementRole",
40
+ "LinkElement",
41
+ "ListItem",
42
+ "RegionRole",
43
+ "SelectOption",
44
+ "SemanticHint",
45
+ "Som",
46
+ "SomElement",
47
+ "SomElementAttrs",
48
+ "SomMeta",
49
+ "SomRegion",
50
+ "StructuredData",
51
+ # Parser
52
+ "from_plasmate",
53
+ "is_valid_som",
54
+ "parse_som",
55
+ # Query
56
+ "filter_elements",
57
+ "find_by_id",
58
+ "find_by_role",
59
+ "find_by_text",
60
+ "get_all_elements",
61
+ "get_compression_ratio",
62
+ "get_forms",
63
+ "get_headings",
64
+ "get_inputs",
65
+ "get_interactive_elements",
66
+ "get_links",
67
+ "get_text",
68
+ "get_text_by_region",
69
+ "to_markdown",
70
+ ]
71
+
72
+ __version__ = "0.3.0"
@@ -0,0 +1,79 @@
1
+ """Parse and validate SOM JSON."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import Any, Union
7
+
8
+ from pydantic import ValidationError
9
+
10
+ from .types import Som
11
+
12
+
13
def parse_som(input: Union[str, dict]) -> Som:
    """Build a validated ``Som`` from a JSON document or an already-decoded dict.

    Args:
        input: Either a JSON string or a dictionary that follows the SOM schema.

    Returns:
        The validated ``Som`` instance.

    Raises:
        TypeError: If ``input`` is neither a ``str`` nor a ``dict``.
        ValueError: If ``input`` is a string that is not valid JSON.
        ValidationError: If the data does not conform to the SOM schema.
    """
    # Dicts are handed to the model as-is; only strings need decoding.
    if isinstance(input, dict):
        return Som.model_validate(input)

    if not isinstance(input, str):
        raise TypeError(f"Expected str or dict, got {type(input).__name__}")

    try:
        decoded = json.loads(input)
    except json.JSONDecodeError as exc:
        raise ValueError(f"Invalid JSON: {exc}") from exc

    return Som.model_validate(decoded)
37
+
38
+
39
def is_valid_som(input: Any) -> bool:
    """Report whether ``input`` can be parsed as a SOM document.

    Args:
        input: A JSON string, a dict, or any other value.

    Returns:
        True when ``parse_som`` accepts the input, False when it raises
        ``ValueError``, ``ValidationError`` or ``TypeError``.
    """
    try:
        parse_som(input)
    except (ValueError, ValidationError, TypeError):
        return False
    return True
53
+
54
+
55
def from_plasmate(json_output: str) -> Som:
    """Turn raw Plasmate CLI JSON output into a ``Som`` object.

    The decoded JSON is either the SOM document itself or a container
    object that holds the document under a ``som`` key.

    Args:
        json_output: Raw JSON string from Plasmate CLI.

    Returns:
        The validated ``Som`` instance.

    Raises:
        ValueError: If the output cannot be parsed.
    """
    try:
        payload = json.loads(json_output)
    except json.JSONDecodeError as exc:
        raise ValueError(f"Invalid JSON from Plasmate: {exc}") from exc

    # A wrapper looks like {"som": {...}}; a top-level "som_version" key
    # means the payload is already the SOM document.
    is_wrapped = (
        isinstance(payload, dict)
        and "som" in payload
        and "som_version" not in payload
    )
    document = payload["som"] if is_wrapped else payload

    return Som.model_validate(document)
@@ -0,0 +1,221 @@
1
+ """Query, filter, and search utilities for SOM objects."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Callable, Dict, List, Optional, Union
6
+
7
+ from .types import ElementRole, RegionRole, Som, SomElement, SomRegion
8
+
9
+
10
+ def _collect_elements(elements: List[SomElement]) -> List[SomElement]:
11
+ """Recursively collect all elements including nested children."""
12
+ result: List[SomElement] = []
13
+ for el in elements:
14
+ result.append(el)
15
+ if el.children:
16
+ result.extend(_collect_elements(el.children))
17
+ return result
18
+
19
+
20
def get_all_elements(som: Som) -> List[SomElement]:
    """Return every element of every region, nested children included, as one list."""
    return [
        element
        for region in som.regions
        for element in _collect_elements(region.elements)
    ]
26
+
27
+
28
def find_by_role(som: Som, role: Union[ElementRole, str]) -> List[SomElement]:
    """Return all elements whose role equals ``role``.

    Args:
        som: The parsed SOM object.
        role: An ElementRole enum value or string (e.g. "link").
    """
    # Strings are converted through the enum constructor first.
    wanted = ElementRole(role) if isinstance(role, str) else role
    matches: List[SomElement] = []
    for candidate in get_all_elements(som):
        if candidate.role == wanted:
            matches.append(candidate)
    return matches
38
+
39
+
40
def find_by_id(som: Som, id: str) -> Optional[SomElement]:
    """Return the first element whose SOM id equals ``id``, or None if there is none."""
    return next(
        (candidate for candidate in get_all_elements(som) if candidate.id == id),
        None,
    )
46
+
47
+
48
def find_by_text(
    som: Som, text: str, *, exact: bool = False
) -> List[SomElement]:
    """Search elements by their ``text`` or ``label``.

    Args:
        som: The parsed SOM object.
        text: The text to search for.
        exact: If True, the element's text or label must equal ``text``
            (case-sensitive). If False (default), ``text`` is matched as a
            case-insensitive substring.
    """
    needle = text.lower()  # only used by the substring branch

    def _matches(candidate: SomElement) -> bool:
        # A missing text/label is treated as an empty string.
        body = candidate.text or ""
        caption = candidate.label or ""
        if exact:
            return text == body or text == caption
        return needle in body.lower() or needle in caption.lower()

    return [candidate for candidate in get_all_elements(som) if _matches(candidate)]
71
+
72
+
73
def get_interactive_elements(som: Som) -> List[SomElement]:
    """Return the elements whose ``actions`` list is non-empty."""
    interactive: List[SomElement] = []
    for candidate in get_all_elements(som):
        if candidate.actions:
            interactive.append(candidate)
    return interactive
76
+
77
+
78
def get_links(som: Som) -> List[Dict[str, Optional[str]]]:
    """Return one ``{"text", "href", "id"}`` dict per link element.

    ``href`` is None when the element carries no attrs.
    """
    return [
        {
            "text": anchor.text,
            "href": anchor.attrs.href if anchor.attrs else None,
            "id": anchor.id,
        }
        for anchor in find_by_role(som, ElementRole.LINK)
    ]
85
+
86
+
87
def get_forms(som: Som) -> List[SomRegion]:
    """Return the regions whose role is ``RegionRole.FORM``."""
    forms: List[SomRegion] = []
    for region in som.regions:
        if region.role == RegionRole.FORM:
            forms.append(region)
    return forms
90
+
91
+
92
def get_inputs(som: Som) -> List[SomElement]:
    """Return all input elements (text_input, textarea, select, checkbox, radio)."""
    accepted = frozenset(
        (
            ElementRole.TEXT_INPUT,
            ElementRole.TEXTAREA,
            ElementRole.SELECT,
            ElementRole.CHECKBOX,
            ElementRole.RADIO,
        )
    )
    found: List[SomElement] = []
    for candidate in get_all_elements(som):
        if candidate.role in accepted:
            found.append(candidate)
    return found
102
+
103
+
104
def get_headings(som: Som) -> List[Dict[str, object]]:
    """Return one ``{"level", "text", "id"}`` dict per heading element.

    ``level`` is 0 when the heading has no attrs or no level set.
    """
    outline: List[Dict[str, object]] = []
    for heading in find_by_role(som, ElementRole.HEADING):
        attrs = heading.attrs
        if attrs and attrs.level is not None:
            depth = attrs.level
        else:
            depth = 0
        outline.append({"level": depth, "text": heading.text, "id": heading.id})
    return outline
111
+
112
+
113
def get_text(som: Som) -> str:
    """Join the text of every element with newlines.

    An element contributes its ``text`` if non-empty, otherwise its
    ``label`` if non-empty; elements with neither are skipped.
    """
    pieces = (el.text or el.label for el in get_all_elements(som))
    return "\n".join(piece for piece in pieces if piece)
122
+
123
+
124
def get_text_by_region(som: Som) -> List[Dict[str, object]]:
    """Return one dict per region with its id, role value, label and joined text.

    Within a region each element contributes its ``text`` if non-empty,
    otherwise its ``label`` if non-empty; the pieces are newline-joined.
    """

    def _region_text(region: SomRegion) -> str:
        pieces = (el.text or el.label for el in _collect_elements(region.elements))
        return "\n".join(piece for piece in pieces if piece)

    return [
        {
            "region_id": region.id,
            "role": region.role.value,
            "label": region.label,
            "text": _region_text(region),
        }
        for region in som.regions
    ]
141
+
142
+
143
def get_compression_ratio(som: Som) -> float:
    """Return ``meta.html_bytes / meta.som_bytes``, or infinity when ``som_bytes`` is 0."""
    stats = som.meta
    compressed = stats.som_bytes
    return stats.html_bytes / compressed if compressed != 0 else float("inf")
148
+
149
+
150
def to_markdown(som: Som) -> str:
    """Render a SOM object as readable markdown.

    The output starts with the title as an H1 and a ``URL:`` line. Each
    region then becomes an H2 section (role, plus label when present)
    followed by the markdown of its elements, nested children included.
    """
    out: List[str] = [f"# {som.title}", f"URL: {som.url}", ""]

    for region in som.regions:
        section = f"## {region.role.value.title()}"
        if region.label:
            section = f"{section}: {region.label}"
        out.extend((section, ""))

        for node in _collect_elements(region.elements):
            _element_to_markdown(node, out)

        out.append("")

    return "\n".join(out)
171
+
172
+
173
def _element_to_markdown(el: SomElement, lines: List[str]) -> None:
    """Append the markdown rendering of a single element to ``lines``.

    The rendering depends on the element's role; roles without a dedicated
    branch emit their text (if any) as a plain line. Missing optional
    attributes never leak the literal text ``None`` into the output: a link
    without an href points at ``#`` and an image without alt/src gets empty
    strings.

    Args:
        el: The element to render.
        lines: Output buffer; it is mutated in place.
    """
    role = el.role
    attrs = el.attrs

    if role == ElementRole.HEADING:
        level = attrs.level if attrs and attrs.level else 1
        prefix = "#" * (level + 2)  # offset by 2 since region is ##
        lines.append(f"{prefix} {el.text or ''}")
    elif role == ElementRole.PARAGRAPH:
        lines.append(el.text or "")
        lines.append("")
    elif role == ElementRole.LINK:
        # attrs may exist without an href; fall back to "#" in that case too.
        href = attrs.href if attrs and attrs.href is not None else "#"
        lines.append(f"[{el.text or ''}]({href})")
    elif role == ElementRole.BUTTON:
        lines.append(f"[Button: {el.text or ''}]")
    elif role == ElementRole.IMAGE:
        # alt and src are each optional, independently of attrs being present.
        alt = attrs.alt if attrs and attrs.alt is not None else ""
        src = attrs.src if attrs and attrs.src is not None else ""
        lines.append(f"![{alt}]({src})")
    elif role in (ElementRole.TEXT_INPUT, ElementRole.TEXTAREA):
        label = el.label or ""
        placeholder = ""
        if attrs and attrs.placeholder:
            placeholder = f' placeholder="{attrs.placeholder}"'
        lines.append(f"[Input: {label}{placeholder}]")
    elif role == ElementRole.SELECT:
        lines.append(f"[Select: {el.label or el.text or ''}]")
    elif role in (ElementRole.CHECKBOX, ElementRole.RADIO):
        checked = "x" if attrs and attrs.checked else ""
        lines.append(f"[{checked}] {el.text or el.label or ''}")
    elif role == ElementRole.LIST:
        if attrs and attrs.items:
            lines.extend(f"- {item.text}" for item in attrs.items)
    elif role == ElementRole.SEPARATOR:
        lines.append("---")
    elif el.text:
        lines.append(el.text)
215
+
216
+
217
def filter_elements(
    som: Som, predicate: Callable[[SomElement], bool]
) -> List[SomElement]:
    """Return the elements (nested children included) for which ``predicate`` is truthy."""
    return list(filter(predicate, get_all_elements(som)))
@@ -0,0 +1,152 @@
1
+ """Pydantic v2 models for SOM (Semantic Object Model)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from enum import Enum
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ from pydantic import BaseModel
9
+
10
+
11
class RegionRole(str, Enum):
    """Roles a ``SomRegion`` can have; each member's value is its string form."""

    NAVIGATION = "navigation"
    MAIN = "main"
    ASIDE = "aside"
    HEADER = "header"
    FOOTER = "footer"
    FORM = "form"
    DIALOG = "dialog"
    CONTENT = "content"
20
+
21
+
22
class ElementRole(str, Enum):
    """Roles a ``SomElement`` can have; each member's value is its string form."""

    LINK = "link"
    BUTTON = "button"
    TEXT_INPUT = "text_input"
    TEXTAREA = "textarea"
    SELECT = "select"
    CHECKBOX = "checkbox"
    RADIO = "radio"
    HEADING = "heading"
    IMAGE = "image"
    LIST = "list"
    TABLE = "table"
    PARAGRAPH = "paragraph"
    SECTION = "section"
    SEPARATOR = "separator"
37
+
38
+
39
class ElementAction(str, Enum):
    """Actions that may be listed in a ``SomElement``'s ``actions``."""

    CLICK = "click"
    TYPE = "type"
    CLEAR = "clear"
    SELECT = "select"
    TOGGLE = "toggle"
45
+
46
+
47
class SemanticHint(str, Enum):
    """Hint values that may be listed in a ``SomElement``'s ``hints``."""

    ACTIVE = "active"
    BADGE = "badge"
    CARD = "card"
    COLLAPSED = "collapsed"
    DANGER = "danger"
    DISABLED = "disabled"
    ERROR = "error"
    EXPANDED = "expanded"
    HERO = "hero"
    HIDDEN = "hidden"
    LARGE = "large"
    LOADING = "loading"
    MODAL = "modal"
    NOTIFICATION = "notification"
    PRIMARY = "primary"
    REQUIRED = "required"
    SECONDARY = "secondary"
    SELECTED = "selected"
    SMALL = "small"
    STICKY = "sticky"
    SUCCESS = "success"
    WARNING = "warning"
70
+
71
+
72
class SelectOption(BaseModel):
    """One entry of ``SomElementAttrs.options``."""

    value: str
    text: str
    selected: Optional[bool] = None
76
+
77
+
78
class ListItem(BaseModel):
    """One entry of ``SomElementAttrs.items``; carries only the item text."""

    text: str
80
+
81
+
82
class SomElementAttrs(BaseModel):
    """Optional attributes attached to a ``SomElement``.

    Every field defaults to None, so any subset may be present.
    """

    href: Optional[str] = None
    input_type: Optional[str] = None
    value: Optional[str] = None
    placeholder: Optional[str] = None
    required: Optional[bool] = None
    disabled: Optional[bool] = None
    checked: Optional[bool] = None
    group: Optional[str] = None
    multiple: Optional[bool] = None
    options: Optional[List[SelectOption]] = None
    level: Optional[int] = None
    alt: Optional[str] = None
    src: Optional[str] = None
    ordered: Optional[bool] = None
    items: Optional[List[ListItem]] = None
    # presumably table column headers and row cells — confirm against the SOM spec
    headers: Optional[List[str]] = None
    rows: Optional[List[List[str]]] = None
    section_label: Optional[str] = None
101
+
102
+
103
class SomElement(BaseModel):
    """A single element of a region; may contain nested child elements."""

    id: str
    role: ElementRole
    text: Optional[str] = None
    label: Optional[str] = None
    actions: Optional[List[ElementAction]] = None
    attrs: Optional[SomElementAttrs] = None
    # Self-referential: children are themselves SomElement instances.
    children: Optional[List[SomElement]] = None
    hints: Optional[List[SemanticHint]] = None
112
+
113
+
114
class SomRegion(BaseModel):
    """A region of the page: a role, an optional label, and its elements."""

    id: str
    role: RegionRole
    label: Optional[str] = None
    # presumably only meaningful for form regions — confirm against the SOM spec
    action: Optional[str] = None
    method: Optional[str] = None
    elements: List[SomElement]
121
+
122
+
123
class SomMeta(BaseModel):
    """Size and count metadata of a SOM document; all four fields are required ints."""

    html_bytes: int
    som_bytes: int
    element_count: int
    interactive_count: int
128
+
129
+
130
class LinkElement(BaseModel):
    """One entry of ``StructuredData.links``; ``rel`` and ``href`` are required."""

    rel: str
    href: str
    type: Optional[str] = None
    hreflang: Optional[str] = None
134
+ hreflang: Optional[str] = None
135
+
136
+
137
class StructuredData(BaseModel):
    """Optional structured-data payload of a ``Som``; every field may be absent."""

    json_ld: Optional[List[Dict[str, Any]]] = None
    open_graph: Optional[Dict[str, str]] = None
    twitter_card: Optional[Dict[str, str]] = None
    meta: Optional[Dict[str, str]] = None
    links: Optional[List[LinkElement]] = None
143
+
144
+
145
class Som(BaseModel):
    """Top-level SOM document: page identity, regions, metadata, optional structured data."""

    som_version: str
    url: str
    title: str
    lang: str
    regions: List[SomRegion]
    meta: SomMeta
    structured_data: Optional[StructuredData] = None
File without changes
@@ -0,0 +1,421 @@
1
+ """Tests for som-parser package."""
2
+
3
+ import json
4
+
5
+ import pytest
6
+
7
+ from som_parser import (
8
+ ElementRole,
9
+ RegionRole,
10
+ Som,
11
+ SomElement,
12
+ filter_elements,
13
+ find_by_id,
14
+ find_by_role,
15
+ find_by_text,
16
+ from_plasmate,
17
+ get_all_elements,
18
+ get_compression_ratio,
19
+ get_forms,
20
+ get_headings,
21
+ get_inputs,
22
+ get_interactive_elements,
23
+ get_links,
24
+ get_text,
25
+ get_text_by_region,
26
+ is_valid_som,
27
+ parse_som,
28
+ to_markdown,
29
+ )
30
+
31
+ FIXTURE_SOM = {
32
+ "som_version": "0.1",
33
+ "url": "https://example.com/",
34
+ "title": "Example Domain",
35
+ "lang": "en",
36
+ "regions": [
37
+ {
38
+ "id": "r_nav",
39
+ "role": "navigation",
40
+ "elements": [
41
+ {
42
+ "id": "e_1",
43
+ "role": "link",
44
+ "text": "Home",
45
+ "actions": ["click"],
46
+ "attrs": {"href": "/"},
47
+ },
48
+ {
49
+ "id": "e_2",
50
+ "role": "link",
51
+ "text": "About",
52
+ "actions": ["click"],
53
+ "attrs": {"href": "/about"},
54
+ },
55
+ ],
56
+ },
57
+ {
58
+ "id": "r_content",
59
+ "role": "content",
60
+ "elements": [
61
+ {
62
+ "id": "e_3",
63
+ "role": "heading",
64
+ "text": "Welcome",
65
+ "attrs": {"level": 1},
66
+ },
67
+ {
68
+ "id": "e_4",
69
+ "role": "paragraph",
70
+ "text": "This is a test page.",
71
+ },
72
+ {
73
+ "id": "e_5",
74
+ "role": "link",
75
+ "text": "Learn more",
76
+ "actions": ["click"],
77
+ "attrs": {"href": "https://example.org"},
78
+ },
79
+ {
80
+ "id": "e_6",
81
+ "role": "image",
82
+ "attrs": {"src": "/logo.png", "alt": "Logo"},
83
+ },
84
+ ],
85
+ },
86
+ {
87
+ "id": "r_form",
88
+ "role": "form",
89
+ "action": "/search",
90
+ "method": "GET",
91
+ "elements": [
92
+ {
93
+ "id": "e_7",
94
+ "role": "text_input",
95
+ "label": "Search",
96
+ "actions": ["type", "clear"],
97
+ "attrs": {"input_type": "text", "placeholder": "Search..."},
98
+ },
99
+ {
100
+ "id": "e_8",
101
+ "role": "button",
102
+ "text": "Go",
103
+ "actions": ["click"],
104
+ },
105
+ ],
106
+ },
107
+ ],
108
+ "meta": {
109
+ "html_bytes": 5000,
110
+ "som_bytes": 800,
111
+ "element_count": 8,
112
+ "interactive_count": 5,
113
+ },
114
+ }
115
+
116
+
117
@pytest.fixture
def som() -> Som:
    """Provide ``FIXTURE_SOM`` parsed into a ``Som`` via ``parse_som``."""
    return parse_som(FIXTURE_SOM)
120
+
121
+
122
@pytest.fixture
def som_json() -> str:
    """Provide ``FIXTURE_SOM`` serialized to a JSON string."""
    return json.dumps(FIXTURE_SOM)
125
+
126
+
127
+ # --- Parser tests ---
128
+
129
+
130
class TestParseSom:
    """Tests for ``parse_som``: accepted input types, error paths, parsed fields."""

    def test_parse_dict(self, som: Som):
        assert isinstance(som, Som)
        assert som.title == "Example Domain"
        assert som.url == "https://example.com/"
        assert som.som_version == "0.1"
        assert som.lang == "en"
        assert len(som.regions) == 3

    def test_parse_json_string(self, som_json: str):
        result = parse_som(som_json)
        assert isinstance(result, Som)
        assert result.title == "Example Domain"

    def test_parse_invalid_json_string(self):
        with pytest.raises(ValueError, match="Invalid JSON"):
            parse_som("not valid json {{{")

    def test_parse_invalid_schema(self):
        # Pydantic v2's ValidationError subclasses ValueError, so this is
        # narrower than ``Exception`` and would not hide unrelated errors.
        with pytest.raises(ValueError):
            parse_som({"not": "a som"})

    def test_parse_wrong_type(self):
        with pytest.raises(TypeError):
            parse_som(42)  # type: ignore

    def test_regions_parsed(self, som: Som):
        assert som.regions[0].role == RegionRole.NAVIGATION
        assert som.regions[1].role == RegionRole.CONTENT
        assert som.regions[2].role == RegionRole.FORM

    def test_elements_parsed(self, som: Som):
        nav_elements = som.regions[0].elements
        assert len(nav_elements) == 2
        assert nav_elements[0].role == ElementRole.LINK
        assert nav_elements[0].text == "Home"

    def test_meta_parsed(self, som: Som):
        assert som.meta.html_bytes == 5000
        assert som.meta.som_bytes == 800
        assert som.meta.element_count == 8
        assert som.meta.interactive_count == 5

    def test_form_region_attrs(self, som: Som):
        form = som.regions[2]
        assert form.action == "/search"
        assert form.method == "GET"
177
+
178
+
179
class TestIsValidSom:
    """Tests for ``is_valid_som``: it returns a bool for valid and invalid input."""

    def test_valid(self):
        assert is_valid_som(FIXTURE_SOM) is True

    def test_valid_string(self):
        assert is_valid_som(json.dumps(FIXTURE_SOM)) is True

    def test_invalid_dict(self):
        assert is_valid_som({"bad": "data"}) is False

    def test_invalid_string(self):
        assert is_valid_som("nope") is False

    def test_invalid_type(self):
        assert is_valid_som(123) is False
194
+
195
+
196
class TestFromPlasmate:
    """Tests for ``from_plasmate``: direct SOM, ``{"som": ...}`` wrapper, bad JSON."""

    def test_direct_som(self):
        result = from_plasmate(json.dumps(FIXTURE_SOM))
        assert result.title == "Example Domain"

    def test_wrapped_som(self):
        wrapped = json.dumps({"som": FIXTURE_SOM})
        result = from_plasmate(wrapped)
        assert result.title == "Example Domain"

    def test_invalid_json(self):
        with pytest.raises(ValueError, match="Invalid JSON"):
            from_plasmate("not json")
209
+
210
+
211
+ # --- Query tests ---
212
+
213
+
214
class TestGetAllElements:
    """Tests for ``get_all_elements``: element count and document order of ids."""

    def test_count(self, som: Som):
        elements = get_all_elements(som)
        assert len(elements) == 8

    def test_all_have_ids(self, som: Som):
        elements = get_all_elements(som)
        ids = [el.id for el in elements]
        assert ids == ["e_1", "e_2", "e_3", "e_4", "e_5", "e_6", "e_7", "e_8"]
223
+
224
+
225
class TestFindByRole:
    """Exercise ``find_by_role`` with enum members and a plain string."""

    def test_links(self, som: Som):
        """Looking up ``ElementRole.LINK`` yields three elements."""
        matched = find_by_role(som, ElementRole.LINK)
        assert len(matched) == 3

    def test_string_role(self, som: Som):
        """Looking up the string ``"link"`` also yields three elements."""
        matched = find_by_role(som, "link")
        assert len(matched) == 3

    def test_headings(self, som: Som):
        """Exactly one heading is found and its text is "Welcome"."""
        matched = find_by_role(som, ElementRole.HEADING)
        assert len(matched) == 1
        only_heading = matched[0]
        assert only_heading.text == "Welcome"

    def test_buttons(self, som: Som):
        """Exactly one button is found and its text is "Go"."""
        matched = find_by_role(som, ElementRole.BUTTON)
        assert len(matched) == 1
        only_button = matched[0]
        assert only_button.text == "Go"

    def test_no_results(self, som: Som):
        """Looking up ``ElementRole.TABLE`` yields an empty list."""
        matched = find_by_role(som, ElementRole.TABLE)
        assert matched == []
247
+
248
+
249
class TestFindById:
    """Exercise ``find_by_id`` for a present and an absent id."""

    def test_found(self, som: Som):
        """Id ``e_3`` resolves to the "Welcome" heading element."""
        match = find_by_id(som, "e_3")
        assert match is not None
        assert match.text == "Welcome"
        assert match.role == ElementRole.HEADING

    def test_not_found(self, som: Som):
        """An unknown id yields ``None``."""
        match = find_by_id(som, "nonexistent")
        assert match is None
258
+
259
+
260
class TestFindByText:
    """Exercise ``find_by_text`` in its default and ``exact=True`` modes."""

    def test_substring(self, som: Som):
        """Lowercase "home" matches a single element, ``e_1``."""
        hits = find_by_text(som, "home")
        assert len(hits) == 1
        assert hits[0].id == "e_1"

    def test_case_insensitive(self, som: Som):
        """Uppercase "WELCOME" still matches a single element."""
        hits = find_by_text(som, "WELCOME")
        assert len(hits) == 1

    def test_exact_match(self, som: Som):
        """With ``exact=True``, "Home" matches a single element."""
        hits = find_by_text(som, "Home", exact=True)
        assert len(hits) == 1

    def test_exact_no_match(self, som: Som):
        """With ``exact=True``, lowercase "home" matches nothing."""
        hits = find_by_text(som, "home", exact=True)
        assert len(hits) == 0

    def test_label_match(self, som: Som):
        """Searching "search" matches a single element, ``e_7``."""
        hits = find_by_text(som, "search")
        assert len(hits) == 1
        assert hits[0].id == "e_7"

    def test_no_match(self, som: Som):
        """A string absent from the fixture matches nothing."""
        hits = find_by_text(som, "xyznonexistent")
        assert len(hits) == 0
286
+
287
+
288
class TestGetInteractiveElements:
    """Exercise ``get_interactive_elements`` against the ``som`` fixture."""

    def test_count(self, som: Som):
        """Five interactive elements are returned."""
        assert len(get_interactive_elements(som)) == 5

    def test_all_have_actions(self, som: Som):
        """Every returned element has a non-``None``, non-empty ``actions``."""
        for element in get_interactive_elements(som):
            assert element.actions is not None
            assert len(element.actions) > 0
298
+
299
+
300
class TestGetLinks:
    """Exercise ``get_links`` against the ``som`` fixture."""

    def test_links(self, som: Som):
        """Three link dicts come back with the expected text, href, and id."""
        expected = [
            {"text": "Home", "href": "/", "id": "e_1"},
            {"text": "About", "href": "/about", "id": "e_2"},
            {
                "text": "Learn more",
                "href": "https://example.org",
                "id": "e_5",
            },
        ]
        found = get_links(som)
        assert len(found) == 3
        # Compare position by position so a mismatch points at one entry.
        for position, wanted in enumerate(expected):
            assert found[position] == wanted
311
+
312
+
313
class TestGetForms:
    """Exercise ``get_forms`` against the ``som`` fixture."""

    def test_forms(self, som: Som):
        """A single form is returned, with id ``r_form`` and action ``/search``."""
        found = get_forms(som)
        assert len(found) == 1
        only_form = found[0]
        assert only_form.id == "r_form"
        assert only_form.action == "/search"
319
+
320
+
321
class TestGetInputs:
    """Exercise ``get_inputs`` against the ``som`` fixture."""

    def test_inputs(self, som: Som):
        """A single input is returned: ``e_7`` with the TEXT_INPUT role."""
        found = get_inputs(som)
        assert len(found) == 1
        only_input = found[0]
        assert only_input.id == "e_7"
        assert only_input.role == ElementRole.TEXT_INPUT
327
+
328
+
329
class TestGetHeadings:
    """Exercise ``get_headings`` against the ``som`` fixture."""

    def test_headings(self, som: Som):
        """A single heading dict is returned with level, text, and id."""
        expected_heading = {"level": 1, "text": "Welcome", "id": "e_3"}
        found = get_headings(som)
        assert len(found) == 1
        assert found[0] == expected_heading
334
+
335
+
336
class TestGetText:
    """Exercise ``get_text`` against the ``som`` fixture."""

    def test_text(self, som: Som):
        """Each expected fragment appears somewhere in the extracted text."""
        extracted = get_text(som)
        expected_fragments = (
            "Home",
            "About",
            "Welcome",
            "This is a test page.",
            "Learn more",
            "Search",
            "Go",
        )
        for fragment in expected_fragments:
            assert fragment in extracted
346
+
347
+
348
class TestGetTextByRegion:
    """Exercise ``get_text_by_region`` against the ``som`` fixture."""

    def test_regions(self, som: Som):
        """Three entries come back; the first describes the ``r_nav`` region."""
        per_region = get_text_by_region(som)
        assert len(per_region) == 3
        first_entry = per_region[0]
        assert first_entry["region_id"] == "r_nav"
        assert first_entry["role"] == "navigation"
        assert "Home" in first_entry["text"]

    def test_content_region(self, som: Som):
        """The second entry has the content role and both expected texts."""
        second_entry = get_text_by_region(som)[1]
        assert second_entry["role"] == "content"
        for fragment in ("Welcome", "This is a test page."):
            assert fragment in second_entry["text"]
362
+
363
+
364
class TestGetCompressionRatio:
    """Exercise ``get_compression_ratio`` against the ``som`` fixture."""

    def test_ratio(self, som: Som):
        """The ratio equals 5000 / 800, which is 6.25."""
        observed = get_compression_ratio(som)
        # Checked both as the quotient and as its literal value.
        assert observed == 5000 / 800
        assert observed == 6.25
369
+
370
+
371
class TestToMarkdown:
    """Exercise ``to_markdown`` by looking for expected snippets in its output."""

    def test_contains_title(self, som: Som):
        """The title is rendered as a level-one heading."""
        rendered = to_markdown(som)
        assert "# Example Domain" in rendered

    def test_contains_url(self, som: Som):
        """The page URL is rendered on a ``URL:`` line."""
        rendered = to_markdown(som)
        assert "URL: https://example.com/" in rendered

    def test_contains_regions(self, som: Som):
        """Each of the three regions gets a level-two heading."""
        rendered = to_markdown(som)
        for region_heading in ("## Navigation", "## Content", "## Form"):
            assert region_heading in rendered

    def test_contains_links(self, som: Som):
        """The Home and About links are rendered in Markdown link syntax."""
        rendered = to_markdown(som)
        for link_markup in ("[Home](/)", "[About](/about)"):
            assert link_markup in rendered

    def test_contains_heading(self, som: Som):
        """The "Welcome" heading is rendered at level three."""
        rendered = to_markdown(som)
        assert "### Welcome" in rendered

    def test_contains_paragraph(self, som: Som):
        """The paragraph text appears in the output."""
        rendered = to_markdown(som)
        assert "This is a test page." in rendered

    def test_contains_image(self, som: Som):
        """The logo is rendered in Markdown image syntax."""
        rendered = to_markdown(som)
        assert "![Logo](/logo.png)" in rendered

    def test_contains_button(self, som: Som):
        """The button is rendered as a bracketed ``Button:`` marker."""
        rendered = to_markdown(som)
        assert "[Button: Go]" in rendered

    def test_contains_input(self, som: Som):
        """The input is rendered with an ``Input:`` marker."""
        rendered = to_markdown(som)
        assert "Input: Search" in rendered
410
+
411
+
412
+ class TestFilterElements:
413
def test_filter_by_actions(self, som: Som):
    """Filter for elements whose actions include a ``"click"`` value."""

    def supports_click(el):
        # Elements without any actions are excluded outright.
        if el.actions is None:
            return False
        return any(action.value == "click" for action in el.actions)

    clickable = filter_elements(som, supports_click)
    assert len(clickable) == 4  # 3 links + 1 button
418
+
419
def test_filter_by_text(self, som: Som):
    """Filter for elements whose ``text`` is not ``None``."""

    def has_text(el):
        return el.text is not None

    with_text = filter_elements(som, has_text)
    assert len(with_text) == 6  # all except image and text_input