play-parser 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40):
  1. play_parser/__init__.py +32 -0
  2. play_parser/__main__.py +6 -0
  3. play_parser/_io.py +51 -0
  4. play_parser/cli/__init__.py +3 -0
  5. play_parser/cli/main.py +240 -0
  6. play_parser/document/__init__.py +21 -0
  7. play_parser/document/assembler.py +185 -0
  8. play_parser/document/builder.py +170 -0
  9. play_parser/document/constants.py +17 -0
  10. play_parser/document/text.py +15 -0
  11. play_parser/document/types.py +92 -0
  12. play_parser/document/validation.py +250 -0
  13. play_parser/domain/__init__.py +11 -0
  14. play_parser/domain/play.py +743 -0
  15. play_parser/ingestion/__init__.py +3 -0
  16. play_parser/ingestion/ingestor.py +181 -0
  17. play_parser/parsing/__init__.py +18 -0
  18. play_parser/parsing/context.py +103 -0
  19. play_parser/parsing/front_matter.py +86 -0
  20. play_parser/parsing/parser.py +292 -0
  21. play_parser/parsing/profiles/__init__.py +15 -0
  22. play_parser/parsing/profiles/builtins/__init__.py +1 -0
  23. play_parser/parsing/profiles/builtins/colon_inline.json +15 -0
  24. play_parser/parsing/profiles/builtins/dot_block.json +16 -0
  25. play_parser/parsing/profiles/builtins/dot_inline.json +16 -0
  26. play_parser/parsing/profiles/builtins/mixed_parenthetical.json +17 -0
  27. play_parser/parsing/profiles/builtins/narrative_stage_heavy.json +17 -0
  28. play_parser/parsing/profiles/loader.py +149 -0
  29. play_parser/parsing/profiles/schema.py +139 -0
  30. play_parser/parsing/speakers.py +130 -0
  31. play_parser/parsing/speech.py +604 -0
  32. play_parser/parsing/stage.py +178 -0
  33. play_parser/parsing/structure.py +87 -0
  34. play_parser/py.typed +0 -0
  35. play_parser-1.0.0.dist-info/METADATA +195 -0
  36. play_parser-1.0.0.dist-info/RECORD +40 -0
  37. play_parser-1.0.0.dist-info/WHEEL +5 -0
  38. play_parser-1.0.0.dist-info/entry_points.txt +2 -0
  39. play_parser-1.0.0.dist-info/licenses/LICENSE +21 -0
  40. play_parser-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,32 @@
1
+ from play_parser.document.assembler import assemble_play_text
2
+ from play_parser.document.validation import validate_play_document
3
+ from play_parser.domain import Act, Character, Dialogue, Monologue, Play, Scene, Speech
4
+ from play_parser.ingestion import PlayIngestor
5
+ from play_parser.parsing.profiles import (
6
+ FormatProfile,
7
+ get_format_profile,
8
+ list_format_profiles,
9
+ load_format_profile_config,
10
+ load_format_profile_file,
11
+ )
12
+
13
# Version string of the package; the CLI's --version flag reports this value.
__version__ = "1.0.0"

# Names re-exported from the submodules imported above; this list is the
# package's declared public API.
__all__ = [
    "__version__",
    "Act",
    "Character",
    "Dialogue",
    "FormatProfile",
    "Monologue",
    "Play",
    "PlayIngestor",
    "Scene",
    "Speech",
    "assemble_play_text",
    "get_format_profile",
    "list_format_profiles",
    "load_format_profile_config",
    "load_format_profile_file",
    "validate_play_document",
]
@@ -0,0 +1,6 @@
1
+ from __future__ import annotations
2
+
3
+ from play_parser.cli import main
4
+
5
# Script entry point: run the CLI and use its integer return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
play_parser/_io.py ADDED
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
# Default upper bound on input file size: 10 MiB.
DEFAULT_MAX_INPUT_BYTES = 10 * 1024 * 1024


def check_file_size(path: str | Path, *, max_bytes: int = DEFAULT_MAX_INPUT_BYTES) -> Path:
    """Validate that *path* names a readable file no larger than *max_bytes*.

    Args:
        path: Location of the file to inspect.
        max_bytes: Largest accepted size in bytes (inclusive).

    Returns:
        *path* converted to a ``Path``.

    Raises:
        ValueError: If the file does not exist, cannot be stat'ed because of
            permissions, is a directory, or exceeds *max_bytes*.
    """
    input_path = Path(path)
    try:
        size = input_path.stat().st_size
    except FileNotFoundError as exc:
        raise ValueError(f"File not found: {input_path}") from exc
    except PermissionError as exc:
        raise ValueError(f"Permission denied reading file: {input_path}") from exc

    # A directory has a (small) st_size too, so it would pass the size check
    # and only fail later when read, with an OSError instead of the
    # ValueError this module otherwise reports. Reject it here.
    if input_path.is_dir():
        raise ValueError(f"Expected a file but found a directory: {input_path}")

    if size > max_bytes:
        raise ValueError(
            f"Input file is too large: {input_path} is {size} bytes; maximum supported size is {max_bytes} bytes"
        )
    return input_path
24
+
25
+
26
def read_text_file(path: str | Path, *, max_bytes: int = DEFAULT_MAX_INPUT_BYTES) -> str:
    """Return the UTF-8 decoded contents of *path* after a size check.

    The path is first passed through ``check_file_size``; permission and
    decoding failures while reading are re-raised as ``ValueError``.
    """
    checked_path = check_file_size(path, max_bytes=max_bytes)
    try:
        contents = checked_path.read_text(encoding="utf-8")
    except PermissionError as exc:
        raise ValueError(f"Permission denied reading file: {checked_path}") from exc
    except UnicodeDecodeError as exc:
        raise ValueError(f"File is not valid UTF-8 text: {checked_path}") from exc
    return contents
34
+
35
+
36
def read_json_file(path: str | Path, *, max_bytes: int = DEFAULT_MAX_INPUT_BYTES) -> Any:
    """Read *path* via ``read_text_file`` and decode it as JSON.

    Raises:
        ValueError: If reading fails or the text is not valid JSON; the JSON
            error message names the file, line and column.
    """
    source = Path(path)
    raw = read_text_file(source, max_bytes=max_bytes)
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError as exc:
        raise ValueError(
            f"Invalid JSON in file {source}: {exc.msg} at line {exc.lineno}, column {exc.colno}"
        ) from exc
    return decoded
45
+
46
+
47
def parse_json_text(text: str, *, label: str = "JSON text") -> Any:
    """Decode *text* as JSON.

    Args:
        text: The JSON source.
        label: Description of the input used in the error message.

    Raises:
        ValueError: If *text* is not valid JSON.
    """
    try:
        decoded = json.loads(text)
    except json.JSONDecodeError as exc:
        where = f"line {exc.lineno}, column {exc.colno}"
        raise ValueError(f"Invalid {label}: {exc.msg} at {where}") from exc
    return decoded
@@ -0,0 +1,3 @@
1
+ from .main import main
2
+
3
# The CLI package exposes only the ``main`` entry point imported above.
__all__ = ["main"]
@@ -0,0 +1,240 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ from pathlib import Path
6
+
7
+ from play_parser import __version__
8
+ from play_parser.document.assembler import assemble_play_text
9
+ from play_parser.ingestion import PlayIngestor
10
+
11
# Default value of the parse command's --indent option.
DEFAULT_JSON_INDENT = 2
12
+
13
+
14
def build_cli_parser() -> argparse.ArgumentParser:
    """Create the top-level argument parser with its ``parse`` and ``assemble`` subcommands."""
    cli = argparse.ArgumentParser(
        description="Parse raw play text to JSON/canonical text, or assemble canonical text from JSON."
    )
    cli.add_argument("--version", action="version", version=f"play-parser {__version__}")
    # A subcommand is mandatory; the chosen name is stored on ``args.command``.
    commands = cli.add_subparsers(dest="command", required=True)
    for register in (add_parse_command, add_assemble_command):
        register(commands)
    return cli
23
+
24
+
25
def add_parse_command(subparsers: argparse._SubParsersAction[argparse.ArgumentParser]) -> None:
    """Register the ``parse`` subcommand and all of its options on *subparsers*."""
    command = subparsers.add_parser("parse", help="Parse raw .txt files into JSON and/or canonical text.")

    # (flag, keyword arguments) pairs, registered in this order so the
    # generated help text lists them in the same sequence.
    option_specs: list[tuple[str, dict[str, object]]] = [
        (
            "inputs",
            {
                "nargs": "*",
                "type": Path,
                "help": "Input .txt files or folders. If omitted, all .txt files under --input-root are processed.",
            },
        ),
        (
            "--input-root",
            {
                "type": Path,
                "default": Path.cwd(),
                "help": (
                    "Default folder to read raw .txt files from when no explicit inputs are given. "
                    "Defaults to the current working directory."
                ),
            },
        ),
        ("--recursive", {"action": "store_true", "help": "Traverse input directories recursively."}),
        ("--json-output", {"type": Path, "help": "Write JSON for exactly one input file."}),
        ("--json-output-root", {"type": Path, "help": "Write one .json file per input into this folder."}),
        (
            "--text-output",
            {"type": Path, "help": "Write canonical assembled .txt for exactly one input file."},
        ),
        (
            "--text-output-root",
            {"type": Path, "help": "Write one canonical .txt file per input into this folder."},
        ),
        ("--profile", {"help": "Built-in format profile name to use while parsing raw text."}),
        ("--profile-path", {"type": Path, "help": "Path to a custom format profile JSON file."}),
        ("--indent", {"type": int, "default": DEFAULT_JSON_INDENT, "help": "JSON indentation level."}),
    ]
    for flag, options in option_specs:
        command.add_argument(flag, **options)
58
+
59
+
60
def add_assemble_command(subparsers: argparse._SubParsersAction[argparse.ArgumentParser]) -> None:
    """Register the ``assemble`` subcommand and all of its options on *subparsers*."""
    command = subparsers.add_parser("assemble", help="Assemble canonical .txt from JSON files.")

    # (flag, keyword arguments) pairs, registered in this order so the
    # generated help text lists them in the same sequence.
    option_specs: list[tuple[str, dict[str, object]]] = [
        (
            "inputs",
            {
                "nargs": "*",
                "type": Path,
                "help": "Input .json files or folders. If omitted, all .json files under --input-root are processed.",
            },
        ),
        (
            "--input-root",
            {
                "type": Path,
                "default": Path.cwd(),
                "help": (
                    "Default folder to read .json files from when no explicit inputs are given. "
                    "Defaults to the current working directory."
                ),
            },
        ),
        ("--recursive", {"action": "store_true", "help": "Traverse input directories recursively."}),
        ("--output", {"type": Path, "help": "Write canonical text for exactly one input file."}),
        (
            "--output-root",
            {"type": Path, "help": "Write one canonical .txt file per input into this folder."},
        ),
    ]
    for flag, options in option_specs:
        command.add_argument(flag, **options)
84
+
85
+
86
def resolve_input_files(
    input_paths: list[Path],
    input_root: Path,
    suffix: str,
    *,
    recursive: bool = False,
) -> list[Path]:
    """Expand the requested inputs into a de-duplicated list of resolved files.

    Args:
        input_paths: Explicit files or folders; when empty, *input_root* is used.
        input_root: Fallback folder, also used to anchor relative candidates.
        suffix: Required file suffix, including the dot (e.g. ``".txt"``).
        recursive: Whether folders are searched recursively.

    Returns:
        Resolved file paths in first-seen order; matches inside each folder
        are sorted.

    Raises:
        SystemExit: If a candidate does not exist or an explicit file has the
            wrong suffix.
    """
    pattern = f"*{suffix}"
    collected: list[Path] = []

    for candidate in input_paths or [input_root]:
        path = resolve_candidate_path(candidate, input_root)
        if not path.exists():
            raise SystemExit(f"Input path not found: {path}")

        if not path.is_dir():
            # Explicit file: the suffix comparison ignores case.
            if path.suffix.lower() != suffix:
                raise SystemExit(f"Expected {suffix} input, got: {path}")
            collected.append(path.resolve())
            continue

        matches = path.rglob(pattern) if recursive else path.glob(pattern)
        collected.extend(sorted(entry.resolve() for entry in matches if entry.is_file()))

    # dict keys keep insertion order, so this drops repeats while preserving
    # the position of each path's first occurrence.
    return list(dict.fromkeys(collected))
117
+
118
+
119
def resolve_candidate_path(candidate: Path, input_root: Path) -> Path:
    """Turn a user-supplied path into the path that should be inspected.

    Absolute candidates are returned untouched. A relative candidate that
    exists relative to the current working directory is resolved as-is;
    otherwise it is resolved relative to *input_root*.
    """
    if candidate.is_absolute():
        return candidate
    anchored = candidate if candidate.exists() else input_root / candidate
    return anchored.resolve()
125
+
126
+
127
def ensure_single_input_output(
    single_output: Path | None,
    output_root: Path | None,
    input_files: list[Path],
    flag: str,
) -> None:
    """Validate use of a single-file output option.

    Does nothing when *single_output* is not given. Otherwise exits if the
    run does not process exactly one input, or if the matching ``--*-root``
    option was supplied as well.

    Raises:
        SystemExit: With a message naming *flag* when either rule is violated.
    """
    if single_output is None:
        return
    if len(input_files) != 1:
        raise SystemExit(f"{flag} can only be used when processing exactly one input file")
    if output_root is not None:
        raise SystemExit(f"Cannot use {flag} together with its corresponding --*-root option")
137
+
138
+
139
def json_output_path_for(input_path: Path, args: argparse.Namespace) -> Path | None:
    """Pick the JSON destination for *input_path* from the parse options.

    ``--json-output`` wins when given; otherwise the file is placed in
    ``--json-output-root`` under the input's stem with a ``.json`` suffix.
    Returns ``None`` when neither option was supplied.
    """
    explicit = args.json_output
    if explicit is not None:
        return explicit.resolve()

    root = args.json_output_root
    return None if root is None else root.resolve() / f"{input_path.stem}.json"
145
+
146
+
147
def text_output_path_for(input_path: Path, args: argparse.Namespace, *, from_json: bool) -> Path | None:
    """Pick the canonical-text destination for *input_path*.

    With ``from_json=True`` the ``assemble`` options (``output`` /
    ``output_root``) are consulted; otherwise the ``parse`` options
    (``text_output`` / ``text_output_root``). The single-file option takes
    precedence over the root folder. Returns ``None`` when neither is set.
    """
    if from_json:
        single_attr, root_attr = "output", "output_root"
    else:
        single_attr, root_attr = "text_output", "text_output_root"

    single = getattr(args, single_attr)
    if single is not None:
        return single.resolve()

    root = getattr(args, root_attr)
    if root is not None:
        return root.resolve() / f"{input_path.stem}.txt"
    return None
160
+
161
+
162
def write_text(path: Path, text: str) -> None:
    """Write *text* to *path* as UTF-8, creating missing parent folders first."""
    destination_dir = path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    path.write_text(text, encoding="utf-8")
165
+
166
+
167
def write_json(path: Path, payload: object, *, indent: int) -> None:
    """Serialize *payload* to *path* as UTF-8 JSON followed by a newline.

    Parent folders are created as needed; non-ASCII characters are written
    unescaped.
    """
    destination_dir = path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, ensure_ascii=False, indent=indent)
    path.write_text(f"{serialized}\n", encoding="utf-8")
170
+
171
+
172
def run_parse(args: argparse.Namespace) -> int:
    """Execute the ``parse`` subcommand.

    Resolves the ``.txt`` inputs, validates option combinations, then for
    each input builds a document with ``PlayIngestor`` and writes the
    requested JSON and/or canonical text outputs.

    Returns:
        ``0`` once every input has been processed.

    Raises:
        SystemExit: If no inputs are found or incompatible options are combined.
    """
    root = args.input_root.resolve()
    sources = resolve_input_files(args.inputs, root, ".txt", recursive=args.recursive)
    if not sources:
        raise SystemExit(f"No .txt files found under {root}")
    if not (args.profile is None or args.profile_path is None):
        raise SystemExit("Cannot use --profile together with --profile-path")

    # Each single-file output flag is checked against its own --*-root twin.
    for single, output_root, flag in (
        (args.json_output, args.json_output_root, "--json-output"),
        (args.text_output, args.text_output_root, "--text-output"),
    ):
        ensure_single_input_output(single, output_root, sources, flag)

    for source in sources:
        ingestor = PlayIngestor(source, profile=args.profile, profile_path=args.profile_path)
        document = ingestor.data
        json_target = json_output_path_for(source, args)
        text_target = text_output_path_for(source, args, from_json=False)

        if json_target is not None:
            write_json(json_target, document, indent=args.indent)
        if text_target is not None:
            write_text(text_target, assemble_play_text(document))

        report_outputs("Parsed", source, json_output=json_target, text_output=text_target)

    return 0
196
+
197
+
198
def run_assemble(args: argparse.Namespace) -> int:
    """Execute the ``assemble`` subcommand.

    Resolves the ``.json`` inputs, assembles canonical text for each one and
    writes it when an output location was requested.

    Returns:
        ``0`` once every input has been processed.

    Raises:
        SystemExit: If no inputs are found or ``--output`` is misused.
    """
    root = args.input_root.resolve()
    sources = resolve_input_files(args.inputs, root, ".json", recursive=args.recursive)
    if not sources:
        raise SystemExit(f"No .json files found under {root}")

    ensure_single_input_output(args.output, args.output_root, sources, "--output")

    for source in sources:
        text_target = text_output_path_for(source, args, from_json=True)
        # Assembly runs even when nothing is written.
        canonical_text = assemble_play_text(source)
        if text_target is not None:
            write_text(text_target, canonical_text)
        report_outputs("Assembled", source, text_output=text_target)

    return 0
214
+
215
+
216
def report_outputs(
    action: str,
    input_path: Path,
    *,
    json_output: Path | None = None,
    text_output: Path | None = None,
) -> None:
    """Print a one-line summary of what was written for *input_path*.

    Args:
        action: Verb placed at the start of the line (e.g. ``"Parsed"``).
        input_path: The processed input file.
        json_output: JSON destination, if one was written.
        text_output: Text destination, if one was written.
    """
    written = [
        f"{kind}={target}"
        for kind, target in (("json", json_output), ("text", text_output))
        if target is not None
    ]
    tail = " -> " + ", ".join(written) if written else " (no files written)"
    print(f"{action} {input_path}{tail}")
232
+
233
+
234
def main() -> int:
    """Parse the command line and dispatch to the selected subcommand.

    Returns:
        The integer status returned by the subcommand handler.

    Raises:
        SystemExit: If the parsed command has no handler.
    """
    args = build_cli_parser().parse_args()
    handlers = {"parse": run_parse, "assemble": run_assemble}
    handler = handlers.get(args.command)
    if handler is None:
        raise SystemExit(f"Unsupported command: {args.command}")
    return handler(args)
@@ -0,0 +1,21 @@
1
+ from .assembler import assemble_play_text
2
+ from .builder import (
3
+ build_play_document,
4
+ build_play_document_from_file,
5
+ build_play_events,
6
+ build_play_events_from_file,
7
+ write_json_file,
8
+ )
9
+ from .validation import validate_events, validate_play_document, validate_roundtrip
10
+
11
# Public names of the document package: the assembler, the builder helpers
# and the validation functions imported above.
__all__ = [
    "assemble_play_text",
    "build_play_document",
    "build_play_document_from_file",
    "build_play_events",
    "build_play_events_from_file",
    "write_json_file",
    "validate_events",
    "validate_play_document",
    "validate_roundtrip",
]
@@ -0,0 +1,185 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Sequence
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ from play_parser._io import parse_json_text, read_json_file
8
+
9
+ from .types import PlayDocument, PlayEvent, SpeechEvent
10
+
11
+
12
def assemble_play_text(events_or_json: Sequence[PlayEvent] | PlayDocument | str | Path) -> str:
    """Render play events as canonical plain text.

    Args:
        events_or_json: A sequence of events, a document mapping with an
            ``"events"`` key, a JSON string, or a ``Path`` to a JSON file.

    Returns:
        The assembled text produced by a fresh ``PlayTextAssembler``.
    """
    normalized_events = _coerce_events(events_or_json)
    assembler = PlayTextAssembler()
    return assembler.assemble(normalized_events)
15
+
16
+
17
class PlayTextAssembler:
    """Formats canonical play events back into plain text.

    The assembler keeps per-run state (pending attached stage directions and
    the most recent speech) while walking the events. ``assemble`` resets
    that state on entry, so one instance can be reused for several runs
    without output from an earlier run influencing a later one.
    """

    def __init__(self) -> None:
        # Stage directions marked attachment == "next", waiting for the
        # following event to absorb them.
        self.pending_attached_stage_directions: list[str] = []
        # Speaker and event index of the most recently formatted speech.
        self.last_speech_speaker: str | None = None
        self.last_speech_index = -1
        # Indices of act_start / scene_start events in the current run.
        self.scene_boundary_indices: set[int] = set()

    def assemble(self, events: Sequence[PlayEvent]) -> str:
        """Return the text for *events*: blocks joined by blank lines plus a final newline."""
        # Start every run from a clean slate; without this, a reused instance
        # could suppress a speaker label (or emit stale stage directions)
        # based on whatever the previous run left behind.
        self.pending_attached_stage_directions = []
        self.last_speech_speaker = None
        self.last_speech_index = -1

        blocks: list[str] = []
        self.scene_boundary_indices = {
            index for index, event in enumerate(events) if event["type"] in {"act_start", "scene_start"}
        }

        for index, event in enumerate(events):
            # "next"-attached directions are held back and merged into the
            # block of the event that follows them.
            if event["type"] == "stage_direction" and event.get("attachment") == "next":
                self.pending_attached_stage_directions.append(event["text"])
                continue

            block = self.format_event(event, events=events, index=index)
            if block:
                blocks.append(block)

        # Directions with no following event are emitted as their own blocks.
        if self.pending_attached_stage_directions:
            blocks.extend(self.pending_attached_stage_directions)
            self.pending_attached_stage_directions = []

        return "\n\n".join(block for block in blocks if block) + "\n"

    def format_event(self, event: PlayEvent, *, events: Sequence[PlayEvent], index: int) -> str:
        """Format one event, consuming any pending attached stage directions."""
        if event["type"] == "speech":
            return self.format_speech(event, events=events, index=index)

        block = self.format_non_speech(event)
        self.pending_attached_stage_directions = []
        # A new act or scene ends the "same speaker continues" context.
        if event["type"] in {"act_start", "scene_start"}:
            self.last_speech_speaker = None
            self.last_speech_index = -1
        return block

    def format_speech(self, event: SpeechEvent, *, events: Sequence[PlayEvent], index: int) -> str:
        """Format a speech event and record it as the most recent speech."""
        suppress_speaker, bare_speaker_label = self.speaker_label_options(event, events=events, index=index)
        block = _event_to_block(
            event,
            suppress_speaker=suppress_speaker,
            bare_speaker_label=bare_speaker_label,
            attached_stage_directions=self.pending_attached_stage_directions,
        )
        self.pending_attached_stage_directions = []
        self.last_speech_speaker = event["speaker"]
        self.last_speech_index = index
        return block

    def speaker_label_options(
        self,
        event: SpeechEvent,
        *,
        events: Sequence[PlayEvent],
        index: int,
    ) -> tuple[bool, bool]:
        """Decide how the speaker label is rendered.

        Returns:
            ``(suppress_speaker, bare_speaker_label)``. When the event has
            ``labelled`` set to ``False``, the label is suppressed if the
            previous speech had the same speaker and rendered as a bare label
            otherwise. When ``labelled`` is absent, the label is suppressed
            only if ``can_suppress_repeated_speaker`` allows it. In all other
            cases both flags are ``False``.
        """
        explicit_labelled = event.get("labelled")
        if explicit_labelled is False:
            should_suppress = self.last_speech_speaker == event["speaker"]
            return should_suppress, not should_suppress

        if explicit_labelled is None and self.can_suppress_repeated_speaker(event, events=events, index=index):
            return True, False

        return False, False

    def can_suppress_repeated_speaker(
        self,
        event: SpeechEvent,
        *,
        events: Sequence[PlayEvent],
        index: int,
    ) -> bool:
        """Return whether this speech continues the previous speaker's turn.

        True only when the previous speech has the same speaker, the event
        directly before this one is not an act or scene start, and every
        event between the previous speech and this one is a stage direction.
        """
        if self.last_speech_speaker != event["speaker"]:
            return False
        if index <= 0 or index - 1 in self.scene_boundary_indices or self.last_speech_index < 0:
            return False
        return all(
            other_event["type"] == "stage_direction" for other_event in events[self.last_speech_index + 1 : index]
        )

    def format_non_speech(self, event: PlayEvent) -> str:
        """Format a non-speech event together with any pending attached directions."""
        return _event_to_block(
            event,
            attached_stage_directions=self.pending_attached_stage_directions,
        )
108
+
109
+
110
def _coerce_events(events_or_json: Sequence[PlayEvent] | PlayDocument | str | Path) -> list[PlayEvent]:
    """Normalize any supported input into a list of events.

    A ``Path`` is read as a JSON file, a ``str`` is parsed as JSON text, and
    anything else is used directly; the result is then unwrapped by
    ``_extract_events``.
    """
    if isinstance(events_or_json, Path):
        payload = read_json_file(events_or_json)
    elif isinstance(events_or_json, str):
        # Strings are treated as JSON source, not as file names.
        payload = parse_json_text(events_or_json)
    else:
        payload = events_or_json
    return _extract_events(payload)
118
+
119
+
120
+ def _extract_events(payload: Sequence[PlayEvent] | PlayDocument | Any) -> list[PlayEvent]:
121
+ if isinstance(payload, dict) and "events" in payload:
122
+ return list(payload["events"])
123
+ return list(payload)
124
+
125
+
126
+ def _event_to_block(
127
+ event: PlayEvent,
128
+ suppress_speaker: bool = False,
129
+ bare_speaker_label: bool = False,
130
+ attached_stage_directions: list[str] | None = None,
131
+ ) -> str:
132
+ attached_stage_directions = attached_stage_directions or []
133
+ event_type = event["type"]
134
+
135
+ if event_type == "speech":
136
+ lines = event["text"].splitlines() if event["text"] else [""]
137
+ first_line = lines[0] if lines else ""
138
+ remainder = lines[1:]
139
+
140
+ prefix = "".join(attached_stage_directions)
141
+
142
+ if bare_speaker_label:
143
+ speech_lines = lines if event["text"] else []
144
+ speaker_label = event["speaker"].upper()
145
+ if prefix:
146
+ return "\n".join([speaker_label, prefix, *speech_lines])
147
+ return "\n".join([speaker_label, *speech_lines])
148
+
149
+ if suppress_speaker:
150
+ head = f"{prefix} {first_line}".strip() if prefix else first_line
151
+ else:
152
+ if prefix:
153
+ head = f"{event['speaker']}: {prefix} {first_line}".rstrip()
154
+ else:
155
+ head = f"{event['speaker']}: {first_line}".rstrip()
156
+
157
+ if not remainder:
158
+ return head
159
+ return "\n".join([head, *remainder])
160
+
161
+ if event_type == "scene_start":
162
+ base = event["label"]
163
+ if attached_stage_directions:
164
+ return "\n".join([*attached_stage_directions, base])
165
+ return base
166
+
167
+ if event_type == "act_start":
168
+ base = event["label"]
169
+ if attached_stage_directions:
170
+ return "\n".join([*attached_stage_directions, base])
171
+ return base
172
+
173
+ if event_type == "stage_direction":
174
+ base = event["text"]
175
+ if attached_stage_directions:
176
+ return "\n".join([*attached_stage_directions, base])
177
+ return base
178
+
179
+ if event_type == "meta":
180
+ base = event["text"]
181
+ if attached_stage_directions:
182
+ return "\n".join([*attached_stage_directions, base])
183
+ return base
184
+
185
+ return ""