play-parser 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. play_parser/__init__.py +32 -0
  2. play_parser/__main__.py +6 -0
  3. play_parser/_io.py +51 -0
  4. play_parser/cli/__init__.py +3 -0
  5. play_parser/cli/main.py +240 -0
  6. play_parser/document/__init__.py +21 -0
  7. play_parser/document/assembler.py +185 -0
  8. play_parser/document/builder.py +170 -0
  9. play_parser/document/constants.py +17 -0
  10. play_parser/document/text.py +15 -0
  11. play_parser/document/types.py +92 -0
  12. play_parser/document/validation.py +250 -0
  13. play_parser/domain/__init__.py +11 -0
  14. play_parser/domain/play.py +743 -0
  15. play_parser/ingestion/__init__.py +3 -0
  16. play_parser/ingestion/ingestor.py +181 -0
  17. play_parser/parsing/__init__.py +18 -0
  18. play_parser/parsing/context.py +103 -0
  19. play_parser/parsing/front_matter.py +86 -0
  20. play_parser/parsing/parser.py +292 -0
  21. play_parser/parsing/profiles/__init__.py +15 -0
  22. play_parser/parsing/profiles/builtins/__init__.py +1 -0
  23. play_parser/parsing/profiles/builtins/colon_inline.json +15 -0
  24. play_parser/parsing/profiles/builtins/dot_block.json +16 -0
  25. play_parser/parsing/profiles/builtins/dot_inline.json +16 -0
  26. play_parser/parsing/profiles/builtins/mixed_parenthetical.json +17 -0
  27. play_parser/parsing/profiles/builtins/narrative_stage_heavy.json +17 -0
  28. play_parser/parsing/profiles/loader.py +149 -0
  29. play_parser/parsing/profiles/schema.py +139 -0
  30. play_parser/parsing/speakers.py +130 -0
  31. play_parser/parsing/speech.py +604 -0
  32. play_parser/parsing/stage.py +178 -0
  33. play_parser/parsing/structure.py +87 -0
  34. play_parser/py.typed +0 -0
  35. play_parser-1.0.0.dist-info/METADATA +195 -0
  36. play_parser-1.0.0.dist-info/RECORD +40 -0
  37. play_parser-1.0.0.dist-info/WHEEL +5 -0
  38. play_parser-1.0.0.dist-info/entry_points.txt +2 -0
  39. play_parser-1.0.0.dist-info/licenses/LICENSE +21 -0
  40. play_parser-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,170 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ from collections.abc import Sequence
6
+ from pathlib import Path
7
+
8
+ from play_parser._io import read_text_file
9
+ from play_parser.document.text import count_words
10
+ from play_parser.parsing.parser import parse_play_text
11
+ from play_parser.parsing.profiles import FormatProfile
12
+
13
+ from .types import PlayDocument, PlayEvent
14
+ from .validation import validate_events
15
+
16
# Matches any run of whitespace; normalize_inline_match_text uses it to
# collapse spacing before names are compared.
WHITESPACE_RE = re.compile(r"\s+")
# Default ``indent`` handed to json.dumps by write_json_file.
DEFAULT_JSON_INDENT = 2
# Lower-case byline prefix that extract_author strips from author metadata.
AUTHOR_BYLINE_PREFIX = "by "
19
+
20
+
21
def build_play_events(
    text: str,
    *,
    profile: FormatProfile | None = None,
) -> list[PlayEvent]:
    """Parse ``text`` into play events and validate them before returning.

    Args:
        text: Raw play text handed to ``parse_play_text``.
        profile: Optional format profile forwarded to the parser unchanged.

    Returns:
        The events produced by the parser, once they have passed validation.

    Raises:
        ValueError: Propagated from ``validate_events`` for an invalid event.
    """
    parsed_events = parse_play_text(text, profile=profile)
    validate_events(parsed_events)
    return parsed_events
29
+
30
+
31
def build_play_document(
    text: str,
    *,
    play_name: str | None = None,
    profile: FormatProfile | None = None,
) -> PlayDocument:
    """Build the ``{"metadata": ..., "events": ...}`` document for ``text``.

    Args:
        text: Raw play text to parse.
        play_name: Title used by the metadata when the events carry no title.
        profile: Optional format profile forwarded to the parser.

    Returns:
        A document holding the validated events and the metadata derived
        from those events and from the raw text.
    """
    events = build_play_events(text, profile=profile)
    metadata = build_play_metadata(events, text, play_name=play_name)
    return {"metadata": metadata, "events": events}
42
+
43
+
44
def build_play_events_from_file(path: str | Path, *, profile: FormatProfile | None = None) -> list[PlayEvent]:
    """Read the file at ``path`` and return its validated play events.

    Args:
        path: Location of the play text; converted to ``Path`` before reading.
        profile: Optional format profile forwarded to the parser.
    """
    raw_text = read_text_file(Path(path))
    return build_play_events(raw_text, profile=profile)
50
+
51
+
52
def build_play_document_from_file(path: str | Path, *, profile: FormatProfile | None = None) -> PlayDocument:
    """Read the file at ``path`` and build its play document.

    The file stem (name without suffix) is passed on as ``play_name``, which
    becomes the title when the parsed events contain no title event.

    Args:
        path: Location of the play text.
        profile: Optional format profile forwarded to the parser.
    """
    source = Path(path)
    raw_text = read_text_file(source)
    return build_play_document(raw_text, play_name=source.stem, profile=profile)
59
+
60
+
61
def write_json_file(
    input_path: str | Path,
    output_path: str | Path,
    *,
    indent: int = DEFAULT_JSON_INDENT,
    profile: FormatProfile | None = None,
) -> PlayDocument:
    """Convert the play at ``input_path`` to JSON written at ``output_path``.

    Missing parent directories of ``output_path`` are created. The JSON is
    written as UTF-8 without ASCII escaping and ends with a newline.

    Args:
        input_path: Play text file to read.
        output_path: Destination JSON file.
        indent: Indentation passed to ``json.dumps``.
        profile: Optional format profile forwarded to the parser.

    Returns:
        The document that was serialized.
    """
    document = build_play_document_from_file(input_path, profile=profile)
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(document, ensure_ascii=False, indent=indent) + "\n"
    destination.write_text(payload, encoding="utf-8")
    return document
73
+
74
+
75
def build_play_metadata(
    events: Sequence[PlayEvent],
    raw_text: str,
    *,
    play_name: str | None = None,
) -> dict[str, object]:
    """Summarise ``events`` and ``raw_text`` into the document metadata.

    Args:
        events: Parsed play events.
        raw_text: Original play text; only used for the ``source_words`` count.
        play_name: Fallback title when no title event is present.

    Returns:
        A dict with ``title``, ``author``, ``characters``, ``acts``, ``scenes``
        and a ``stats`` dict of counts.
    """
    title = extract_title(events, play_name=play_name)
    author = extract_author(events)
    acts, scenes = collect_structure(events)
    characters = collect_characters(events)

    speech_events = [event for event in events if event["type"] == "speech"]
    stage_direction_total = sum(1 for event in events if event["type"] == "stage_direction")

    stats = {
        "stage_directions": stage_direction_total,
        "speeches": len(speech_events),
        "scenes": len(scenes),
        "acts": len(acts),
        "source_words": count_words(raw_text),
        "spoken_words": sum(count_words(speech["text"]) for speech in speech_events),
    }
    return {
        "title": title,
        "author": author,
        "characters": characters,
        "acts": acts,
        "scenes": scenes,
        "stats": stats,
    }
100
+
101
+
102
def extract_title(events: Sequence[PlayEvent], *, play_name: str | None) -> str | None:
    """Return the text of the first title meta event, else ``play_name``.

    Args:
        events: Parsed play events, scanned in order.
        play_name: Value returned when no ``meta``/``title`` event exists.
    """
    title_texts = (
        event["text"]
        for event in events
        if event["type"] == "meta" and event["subtype"] == "title"
    )
    return next(title_texts, play_name)
107
+
108
+
109
def extract_author(events: Sequence[PlayEvent]) -> str | None:
    """Return the author named by the first author meta event, if any.

    Only the first ``meta``/``author`` event is considered. Its text is
    stripped, a leading ``AUTHOR_BYLINE_PREFIX`` (matched case-insensitively)
    is removed, and an empty result becomes ``None``.
    """
    for event in events:
        # The subtype is only read for meta events; other event types may not
        # carry that key.
        if event["type"] != "meta" or event["subtype"] != "author":
            continue
        byline = event["text"].strip()
        if byline.lower().startswith(AUTHOR_BYLINE_PREFIX):
            byline = byline[len(AUTHOR_BYLINE_PREFIX) :].strip()
        return byline or None
    return None
117
+
118
+
119
def collect_characters(events: Sequence[PlayEvent]) -> list[str]:
    """List character names in order of first appearance, without duplicates.

    Names come from speech speakers and from stage directions whose subtype
    is ``"character"``; ``add_character`` decides what counts as a duplicate.
    """
    ordered_names: list[str] = []
    seen_markers: set[str] = set()

    for event in events:
        kind = event["type"]
        if kind == "speech":
            add_character(ordered_names, seen_markers, event["speaker"])
        elif kind == "stage_direction" and event.get("subtype") == "character":
            add_character(ordered_names, seen_markers, event["text"])

    return ordered_names
130
+
131
+
132
def add_character(characters: list[str], seen: set[str], value: str) -> None:
    """Append the stripped ``value`` to ``characters`` unless already seen.

    Blank names are ignored. Duplicates are detected on the form returned by
    ``normalize_inline_match_text``; the stripped original spelling is what
    gets stored. Both ``characters`` and ``seen`` are mutated in place.
    """
    trimmed = value.strip()
    if trimmed:
        key = normalize_inline_match_text(trimmed)
        if key not in seen:
            seen.add(key)
            characters.append(trimmed)
141
+
142
+
143
def collect_structure(events: Sequence[PlayEvent]) -> tuple[list[dict[str, object]], list[dict[str, object]]]:
    """Collect act and scene summaries from ``events`` in document order.

    Each scene summary records the number of the most recent ``act_start``
    event, or ``None`` when the scene precedes any act.

    Returns:
        ``(acts, scenes)`` as two lists of summary dicts.
    """
    act_summaries: list[dict[str, object]] = []
    scene_summaries: list[dict[str, object]] = []
    active_act: int | None = None

    for event in events:
        kind = event["type"]
        if kind == "act_start":
            active_act = event["act_number"]
            act_summaries.append({"act_number": active_act, "label": event["label"]})
        elif kind == "scene_start":
            scene_summary = {
                "act_number": active_act,
                # scene_number may be absent from the event, hence .get().
                "scene_number": event.get("scene_number"),
                "label": event["label"],
            }
            scene_summaries.append(scene_summary)

    return act_summaries, scene_summaries
167
+
168
+
169
def normalize_inline_match_text(text: str) -> str:
    """Return ``text`` with whitespace collapsed, ends trimmed and case folded.

    Splitting on whitespace and re-joining with single spaces yields the same
    result as substituting every whitespace run with a space and stripping.
    """
    collapsed = " ".join(text.split())
    return collapsed.casefold()
@@ -0,0 +1,17 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
# Canonical word token: a run of ASCII alphanumerics or letters in the ranges
# À-Ö, Ø-ö and ø-ÿ, optionally continued by further runs joined with a
# straight (') or curly (’) apostrophe.
WORD_RE = re.compile(r"[0-9A-Za-zÀ-ÖØ-öø-ÿ]+(?:['’][0-9A-Za-zÀ-ÖØ-öø-ÿ]+)*")

# Subtype values accepted on stage_direction events, in declaration order.
STAGE_DIRECTION_SUBTYPES = (
    "enter",
    "exit",
    "action",
    "aside",
    "setting",
    "sound",
    "cast_list_heading",
    "character",
)
# Same values as a frozenset, for membership tests.
VALID_STAGE_DIRECTION_SUBTYPES = frozenset(STAGE_DIRECTION_SUBTYPES)
@@ -0,0 +1,15 @@
1
+ """Shared text helpers for parser and document validation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .constants import WORD_RE
6
+
7
+
8
def count_words(text: str) -> int:
    """Return how many non-overlapping ``WORD_RE`` matches ``text`` contains."""
    return sum(1 for _ in WORD_RE.finditer(text))
11
+
12
+
13
def normalise_whitespace(text: str) -> str:
    """Return ``text`` with every whitespace run replaced by one space.

    Leading and trailing whitespace is dropped as a consequence of splitting.
    """
    words = text.split()
    return " ".join(words)
@@ -0,0 +1,92 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Literal, TypedDict
4
+
5
+
6
class ActStartEvent(TypedDict):
    """Event dict tagged ``"act_start"``, carrying an act number and label."""

    type: Literal["act_start"]
    act_number: int
    label: str
10
+
11
+
12
class SceneStartEvent(TypedDict):
    """Event dict tagged ``"scene_start"``, carrying a scene number and label."""

    type: Literal["scene_start"]
    # NOTE(review): validate_scene_start_event accepts events that omit this
    # key entirely - confirm whether it should be declared as not required.
    scene_number: int | None
    label: str
16
+
17
+
18
class SpeechEventRequired(TypedDict):
    """Keys that every ``"speech"`` event dict must carry."""

    type: Literal["speech"]
    speaker: str
    text: str
    line_count: int
    word_count: int
24
+
25
+
26
class SpeechEvent(SpeechEventRequired, total=False):
    """Speech event: the required keys plus an optional ``labelled`` flag."""

    labelled: bool
28
+
29
+
30
# Static type for the ``subtype`` of a stage_direction event. The values
# mirror STAGE_DIRECTION_SUBTYPES in the constants module; keep both in sync.
StageDirectionSubtype = Literal[
    "enter",
    "exit",
    "action",
    "aside",
    "setting",
    "sound",
    "cast_list_heading",
    "character",
]
40
+
41
+
42
class StageDirectionEventRequired(TypedDict):
    """Keys that every ``"stage_direction"`` event dict must carry."""

    type: Literal["stage_direction"]
    subtype: StageDirectionSubtype
    text: str
46
+
47
+
48
class StageDirectionEvent(StageDirectionEventRequired, total=False):
    """Stage direction event: required keys plus an optional ``attachment``."""

    # NOTE(review): validate_stage_direction_event also tolerates an explicit
    # None here - confirm whether the type should allow it.
    attachment: Literal["next"]
50
+
51
+
52
class MetaEvent(TypedDict):
    """Event dict tagged ``"meta"``, carrying a free-form subtype and text."""

    type: Literal["meta"]
    subtype: str
    text: str
56
+
57
+
58
class ActSummary(TypedDict):
    """One entry of the metadata ``acts`` list."""

    act_number: int
    label: str
61
+
62
+
63
class SceneSummary(TypedDict):
    """One entry of the metadata ``scenes`` list; both numbers may be None."""

    act_number: int | None
    scene_number: int | None
    label: str
67
+
68
+
69
class PlayStats(TypedDict):
    """Integer statistics stored under the metadata ``stats`` key."""

    stage_directions: int
    speeches: int
    scenes: int
    acts: int
    source_words: int
    spoken_words: int
76
+
77
+
78
class PlayMetadata(TypedDict):
    """Shape of the ``metadata`` object of a play document."""

    title: str | None
    author: str | None
    characters: list[str]
    acts: list[ActSummary]
    scenes: list[SceneSummary]
    stats: PlayStats
85
+
86
+
87
class PlayDocument(TypedDict):
    """Top-level play document: metadata plus the ordered event list."""

    metadata: PlayMetadata
    # PlayEvent is defined further down the module; the forward reference
    # works because annotations are evaluated lazily in this module.
    events: list[PlayEvent]
90
+
91
+
92
# Union of every event dict shape; the ``type`` key tells the members apart.
PlayEvent = ActStartEvent | SceneStartEvent | SpeechEvent | StageDirectionEvent | MetaEvent
@@ -0,0 +1,250 @@
1
+ """
2
+ Validation functions for play parser events and roundtrip testing.
3
+ """
4
+
5
+ from collections.abc import Sequence
6
+ from typing import cast
7
+
8
+ from .assembler import assemble_play_text
9
+ from .constants import VALID_STAGE_DIRECTION_SUBTYPES
10
+ from .types import PlayDocument, PlayEvent, PlayMetadata
11
+
12
# Default number of normalized characters diff_context shows on each side of
# the first difference.
DEFAULT_DIFF_CONTEXT_SIZE = 50
# Keys that must be present in the metadata object.
REQUIRED_METADATA_FIELDS = {"title", "author", "characters", "acts", "scenes", "stats"}
# Keys that must be present in each metadata.acts entry.
REQUIRED_ACT_SUMMARY_FIELDS = {"act_number", "label"}
# Keys that must be present in each metadata.scenes entry.
REQUIRED_SCENE_SUMMARY_FIELDS = {"act_number", "scene_number", "label"}
# Keys that must be present in metadata.stats.
REQUIRED_STATS_FIELDS = {"stage_directions", "speeches", "scenes", "acts", "source_words", "spoken_words"}
17
+
18
+
19
def count_lines(text: str) -> int:
    """Return the number of lines in ``text``; empty text has zero lines."""
    return len(text.splitlines()) if text else 0
24
+
25
+
26
def normalize_for_comparison(text: str) -> str:
    """Lower-case ``text`` and strip out every whitespace character."""
    lowered = text.lower()
    return "".join(lowered.split())
29
+
30
+
31
def is_integer(value: object) -> bool:
    """Return True for ``int`` values, excluding ``bool``.

    ``bool`` is a subclass of ``int``, so it has to be ruled out explicitly.
    """
    if isinstance(value, bool):
        return False
    return isinstance(value, int)
33
+
34
+
35
def is_non_negative_integer(value: object) -> bool:
    """Return True when ``value`` is a non-bool ``int`` that is at least 0."""
    if isinstance(value, bool) or not isinstance(value, int):
        return False
    return value >= 0
37
+
38
+
39
def require_fields(event: dict[str, object], required_fields: set[str], event_type: str) -> None:
    """Ensure ``event`` contains every key in ``required_fields``.

    Args:
        event: Event dict being validated.
        required_fields: Keys that must be present.
        event_type: Event type name used in the error message.

    Raises:
        ValueError: If any required key is missing. The missing keys are
            listed in sorted order so the message is deterministic and
            matches the format used by the metadata validators.
    """
    missing = sorted(required_fields - event.keys())
    if missing:
        raise ValueError(f"{event_type} event missing required fields: {missing}")
43
+
44
+
45
def validate_metadata(metadata: object) -> PlayMetadata:
    """Validate the metadata object of a play document.

    Checks are run in a fixed order: required keys, title/author, then the
    ``characters``, ``acts``, ``scenes`` and ``stats`` sections.

    Returns:
        The same ``metadata`` object, typed as ``PlayMetadata``.

    Raises:
        ValueError: If ``metadata`` is not a dict or any check fails.
    """
    if not isinstance(metadata, dict):
        raise ValueError("Canonical play JSON must contain 'metadata' as an object")

    require_metadata_fields(metadata)
    validate_title_and_author(metadata)

    section_validators = (
        ("characters", validate_character_summaries),
        ("acts", validate_act_summaries),
        ("scenes", validate_scene_summaries),
        ("stats", validate_stats),
    )
    for section, validator in section_validators:
        validator(metadata[section])

    return cast(PlayMetadata, metadata)
57
+
58
+
59
def require_metadata_fields(metadata: dict[str, object]) -> None:
    """Raise ``ValueError`` naming, in sorted order, any missing metadata key."""
    absent = sorted(REQUIRED_METADATA_FIELDS - metadata.keys())
    if absent:
        raise ValueError(f"Canonical play JSON metadata is missing required fields: {absent}")
63
+
64
+
65
def validate_title_and_author(metadata: dict[str, object]) -> None:
    """Check that ``title`` and ``author`` are each a string or ``None``.

    The title is checked first, so it is the one reported when both are bad.

    Raises:
        ValueError: If either value has any other type.
    """
    for field_name in ("title", "author"):
        value = metadata[field_name]
        if value is None or isinstance(value, str):
            continue
        raise ValueError(f"Canonical play JSON metadata.{field_name} must be a string or null")
70
+
71
+
72
def validate_character_summaries(characters: object) -> None:
    """Raise ``ValueError`` unless ``characters`` is a list holding only strings."""
    is_string_list = isinstance(characters, list) and all(isinstance(name, str) for name in characters)
    if not is_string_list:
        raise ValueError("Canonical play JSON metadata.characters must be a list of strings")
75
+
76
+
77
def validate_act_summaries(acts: object) -> None:
    """Validate ``metadata.acts``: it must be a list of valid act summaries.

    Raises:
        ValueError: If ``acts`` is not a list or an entry is invalid.
    """
    if not isinstance(acts, list):
        raise ValueError("Canonical play JSON metadata.acts must be a list")

    for position, entry in enumerate(acts):
        validate_act_summary(entry, position)
83
+
84
+
85
def validate_act_summary(act: object, index: int) -> None:
    """Validate one ``metadata.acts`` entry.

    Args:
        act: Candidate act summary.
        index: Position of the entry in the list, used in error messages.

    Raises:
        ValueError: If the entry is not a dict, lacks a required key, has a
            non-integer ``act_number`` or a non-string ``label``.
    """
    location = f"Canonical play JSON metadata.acts[{index}]"
    if not isinstance(act, dict):
        raise ValueError(f"{location} must be an object")

    absent = sorted(REQUIRED_ACT_SUMMARY_FIELDS - act.keys())
    if absent:
        raise ValueError(f"{location} is missing required fields: {absent}")
    if not is_integer(act["act_number"]):
        raise ValueError(f"{location}.act_number must be an integer")
    if not isinstance(act["label"], str):
        raise ValueError(f"{location}.label must be a string")
98
+
99
+
100
def validate_scene_summaries(scenes: object) -> None:
    """Validate ``metadata.scenes``: it must be a list of valid scene summaries.

    Raises:
        ValueError: If ``scenes`` is not a list or an entry is invalid.
    """
    if not isinstance(scenes, list):
        raise ValueError("Canonical play JSON metadata.scenes must be a list")

    for position, entry in enumerate(scenes):
        validate_scene_summary(entry, position)
106
+
107
+
108
def validate_scene_summary(scene: object, index: int) -> None:
    """Validate one ``metadata.scenes`` entry.

    Args:
        scene: Candidate scene summary.
        index: Position of the entry in the list, used in error messages.

    Raises:
        ValueError: If the entry is not a dict, lacks a required key, has an
            ``act_number``/``scene_number`` that is neither an integer nor
            ``None``, or has a non-string ``label``.
    """
    location = f"Canonical play JSON metadata.scenes[{index}]"
    if not isinstance(scene, dict):
        raise ValueError(f"{location} must be an object")

    absent = sorted(REQUIRED_SCENE_SUMMARY_FIELDS - scene.keys())
    if absent:
        raise ValueError(f"{location} is missing required fields: {absent}")

    # act_number is checked before scene_number, matching the reporting order.
    for field_name in ("act_number", "scene_number"):
        number = scene[field_name]
        if number is not None and not is_integer(number):
            raise ValueError(f"{location}.{field_name} must be an integer or null")

    if not isinstance(scene["label"], str):
        raise ValueError(f"{location}.label must be a string")
123
+
124
+
125
def validate_stats(stats: object) -> None:
    """Validate ``metadata.stats``.

    Every required statistic is a count, so each must be a non-negative
    integer (``bool`` values are rejected). Fields are checked in sorted
    order, so the first offending field reported is deterministic.

    Raises:
        ValueError: If ``stats`` is not a dict, lacks a required key, or a
            required value is not a non-negative integer.
    """
    if not isinstance(stats, dict):
        raise ValueError("Canonical play JSON metadata.stats must be an object")

    missing_stats_fields = REQUIRED_STATS_FIELDS - stats.keys()
    if missing_stats_fields:
        raise ValueError(
            f"Canonical play JSON metadata.stats is missing required fields: {sorted(missing_stats_fields)}"
        )
    for field_name in sorted(REQUIRED_STATS_FIELDS):
        # A negative count can never be valid; reject it here instead of
        # letting it pass as "an integer".
        if not is_non_negative_integer(stats[field_name]):
            raise ValueError(f"Canonical play JSON metadata.stats.{field_name} must be a non-negative integer")
138
+
139
+
140
def validate_play_document(document: object) -> PlayDocument:
    """Validate a decoded play document and return it as a ``PlayDocument``.

    Returns:
        A new dict holding the validated ``metadata`` and ``events`` objects.

    Raises:
        ValueError: If ``document`` is not a dict, ``metadata`` is not a dict,
            ``events`` is not a list, or their contents fail validation.
    """
    if not isinstance(document, dict):
        raise ValueError("Canonical play JSON must be an object with 'metadata' and 'events'")

    metadata = document.get("metadata")
    events = document.get("events")
    if not (isinstance(metadata, dict) and isinstance(events, list)):
        raise ValueError("Canonical play JSON must contain 'metadata' and 'events'")

    checked_metadata = validate_metadata(metadata)
    validate_events(events)
    return {"metadata": checked_metadata, "events": cast(list[PlayEvent], events)}
155
+
156
+
157
def validate_act_start_event(event: dict[str, object]) -> None:
    """Validate an ``act_start`` event: required keys and an integer act number.

    Raises:
        ValueError: If a required key is missing or ``act_number`` is not an
            integer.
    """
    required = {"type", "act_number", "label"}
    require_fields(event, required, "act_start")
    act_number = event["act_number"]
    if not is_integer(act_number):
        raise ValueError("act_start.act_number must be an integer")
161
+
162
+
163
def validate_scene_start_event(event: dict[str, object]) -> None:
    """Validate a ``scene_start`` event.

    ``scene_number`` is optional; when present it must be an integer or None.

    Raises:
        ValueError: If a required key is missing or ``scene_number`` has an
            invalid type.
    """
    require_fields(event, {"type", "label"}, "scene_start")
    # A missing key and an explicit None are both acceptable.
    scene_number = event.get("scene_number")
    if scene_number is not None and not is_integer(scene_number):
        raise ValueError("scene_start.scene_number must be an integer or null")
167
+
168
+
169
def validate_speech_event(event: dict[str, object]) -> None:
    """Validate a ``speech`` event.

    Raises:
        ValueError: If a required key is missing, the speaker is empty, a
            counter is not a non-negative integer, or ``labelled`` is present
            but not a bool.
    """
    require_fields(event, {"type", "speaker", "text", "line_count", "word_count"}, "speech")
    if not event["speaker"]:
        raise ValueError("speech event must have non-empty speaker")

    # line_count is checked before word_count, matching the reporting order.
    for counter in ("line_count", "word_count"):
        if not is_non_negative_integer(event[counter]):
            raise ValueError(f"speech.{counter} must be a non-negative integer")

    # A missing key defaults to a bool and therefore passes.
    if not isinstance(event.get("labelled", False), bool):
        raise ValueError("speech.labelled must be a boolean when present")
179
+
180
+
181
def validate_stage_direction_event(event: dict[str, object]) -> None:
    """Validate a ``stage_direction`` event.

    Raises:
        ValueError: If a required key is missing, the subtype is not one of
            ``VALID_STAGE_DIRECTION_SUBTYPES``, or ``attachment`` is present
            with a value other than ``None`` or ``"next"``.
    """
    require_fields(event, {"type", "subtype", "text"}, "stage_direction")

    subtype = event["subtype"]
    # The isinstance test comes first so unhashable values never reach the
    # frozenset membership test.
    if not (isinstance(subtype, str) and subtype in VALID_STAGE_DIRECTION_SUBTYPES):
        raise ValueError(f"Invalid stage_direction subtype: {subtype}")

    if event.get("attachment") not in (None, "next"):
        raise ValueError("stage_direction.attachment must be 'next' when present")
187
+
188
+
189
def validate_meta_event(event: dict[str, object]) -> None:
    """Validate a ``meta`` event; only the presence of its keys is checked."""
    required = {"type", "subtype", "text"}
    require_fields(event, required, "meta")
191
+
192
+
193
# Dispatch table from an event's ``type`` value to the validator for that
# event shape; validate_event_schema rejects types that are not listed here.
_EVENT_VALIDATORS = {
    "act_start": validate_act_start_event,
    "scene_start": validate_scene_start_event,
    "speech": validate_speech_event,
    "stage_direction": validate_stage_direction_event,
    "meta": validate_meta_event,
}
200
+
201
+
202
def validate_event_schema(event: PlayEvent) -> None:
    """Validate that an event conforms to the expected schema.

    The event must be a dict with a ``type`` key naming one of the entries
    in ``_EVENT_VALIDATORS``; the matching validator is then applied.

    Raises:
        ValueError: If the event is not a dict, has no ``type`` key, has an
            unknown ``type``, or fails its type-specific validator.
    """
    if not isinstance(event, dict):
        raise ValueError(f"Event must be a dict, got {type(event)}")
    if "type" not in event:
        raise ValueError("Event must have 'type' field")

    event_type = event["type"]
    # Only strings can name a validator. Looking up an unhashable value
    # (e.g. a list decoded from JSON) would raise TypeError, which callers
    # handling ValueError would not catch.
    validator = _EVENT_VALIDATORS.get(event_type) if isinstance(event_type, str) else None
    if validator is None:
        raise ValueError(f"Unknown event type: {event_type}")
    validator(cast(dict[str, object], event))
214
+
215
+
216
def validate_events(events: Sequence[PlayEvent]) -> None:
    """Validate every event in order, stopping at the first invalid one.

    Raises:
        ValueError: Wrapping the underlying error and naming the index of
            the offending event.
    """
    for position, candidate in enumerate(events):
        try:
            validate_event_schema(candidate)
        except ValueError as error:
            raise ValueError(f"Invalid event at index {position}: {error}") from error
222
+
223
+
224
def validate_roundtrip(original_text: str, events: Sequence[PlayEvent]) -> bool:
    """Report whether re-assembling ``events`` reproduces ``original_text``.

    Both texts are compared after ``normalize_for_comparison``, so case and
    whitespace differences are ignored.
    """
    rebuilt_text = assemble_play_text(events)
    expected = normalize_for_comparison(original_text)
    actual = normalize_for_comparison(rebuilt_text)
    return expected == actual
227
+
228
+
229
def diff_context(original: str, assembled: str, context_size: int = DEFAULT_DIFF_CONTEXT_SIZE) -> str:
    """Describe the first difference between two texts after normalization.

    Args:
        original: Source text.
        assembled: Text rebuilt from events.
        context_size: Number of normalized characters shown on each side of
            the first differing position.

    Returns:
        ``"No differences found"`` when the normalized texts are equal,
        otherwise a three-line report with the position and both excerpts.
    """
    original_norm = normalize_for_comparison(original)
    assembled_norm = normalize_for_comparison(assembled)

    # zip stops at the shorter text, so this scans the common prefix only.
    diff_pos = next(
        (
            offset
            for offset, (left, right) in enumerate(zip(original_norm, assembled_norm))
            if left != right
        ),
        None,
    )
    if diff_pos is None:
        if len(original_norm) == len(assembled_norm):
            return "No differences found"
        # One text is a strict prefix of the other: they diverge where the
        # shorter one ends.
        diff_pos = min(len(original_norm), len(assembled_norm))

    window = slice(max(0, diff_pos - context_size), diff_pos + context_size)
    return (
        f"Difference at normalized position {diff_pos}:\n"
        f"Original: ...{original_norm[window]}...\n"
        f"Assembled: ...{assembled_norm[window]}..."
    )
@@ -0,0 +1,11 @@
1
+ from .play import Act, Character, Dialogue, Monologue, Play, Scene, Speech
2
+
3
# Public names re-exported from the sibling ``play`` module.
__all__ = [
    "Act",
    "Scene",
    "Dialogue",
    "Character",
    "Speech",
    "Monologue",
    "Play",
]