oaknut-basic 12.10.0__tar.gz → 12.11.0__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {oaknut_basic-12.10.0/src/oaknut_basic.egg-info → oaknut_basic-12.11.0}/PKG-INFO +1 -1
  2. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/src/oaknut/basic/__init__.py +5 -1
  3. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/src/oaknut/basic/cli.py +18 -2
  4. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/src/oaknut/basic/detokeniser.py +14 -5
  5. oaknut_basic-12.11.0/src/oaknut/basic/dialect.py +164 -0
  6. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/src/oaknut/basic/numbering.py +27 -1
  7. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/src/oaknut/basic/scanner.py +27 -9
  8. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/src/oaknut/basic/tokeniser.py +7 -10
  9. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0/src/oaknut_basic.egg-info}/PKG-INFO +1 -1
  10. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/src/oaknut_basic.egg-info/SOURCES.txt +2 -0
  11. oaknut_basic-12.11.0/tests/test_basic_v.py +195 -0
  12. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/tests/test_cli.py +26 -0
  13. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/tests/test_numbering.py +7 -0
  14. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/LICENSE +0 -0
  15. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/README.md +0 -0
  16. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/pyproject.toml +0 -0
  17. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/setup.cfg +0 -0
  18. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/src/oaknut/basic/datafile.py +0 -0
  19. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/src/oaknut/basic/exceptions.py +0 -0
  20. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/src/oaknut/basic/float5.py +0 -0
  21. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/src/oaknut/basic/linenumber.py +0 -0
  22. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/src/oaknut/basic/tokens.py +0 -0
  23. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/src/oaknut_basic.egg-info/dependency_links.txt +0 -0
  24. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/src/oaknut_basic.egg-info/entry_points.txt +0 -0
  25. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/src/oaknut_basic.egg-info/requires.txt +0 -0
  26. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/src/oaknut_basic.egg-info/top_level.txt +0 -0
  27. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/tests/test_basic.py +0 -0
  28. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/tests/test_crunch_rules.py +0 -0
  29. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/tests/test_data_cli.py +0 -0
  30. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/tests/test_datafile.py +0 -0
  31. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/tests/test_detokeniser.py +0 -0
  32. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/tests/test_float5.py +0 -0
  33. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/tests/test_keyword_coverage.py +0 -0
  34. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/tests/test_linenumber.py +0 -0
  35. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/tests/test_rom_golden.py +0 -0
  36. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/tests/test_rom_golden_detokenise.py +0 -0
  37. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/tests/test_scanner.py +0 -0
  38. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/tests/test_tokeniser.py +0 -0
  39. {oaknut_basic-12.10.0 → oaknut_basic-12.11.0}/tests/test_tokens.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: oaknut-basic
3
- Version: 12.10.0
3
+ Version: 12.11.0
4
4
  Summary: BBC BASIC tools: program tokeniser/de-tokeniser and PRINT#/INPUT# data-file reader/writer
5
5
  Author-email: Robert Smallshire <robert@smallshire.org.uk>
6
6
  License-Expression: MIT
@@ -31,6 +31,7 @@ from oaknut.basic.datafile import (
31
31
  BbcBasicDataWriter,
32
32
  )
33
33
  from oaknut.basic.detokeniser import detokenise, detokenise_body
34
+ from oaknut.basic.dialect import BASIC_II, BASIC_V, Dialect
34
35
  from oaknut.basic.exceptions import (
35
36
  AlreadyNumberedError,
36
37
  BASICError,
@@ -79,7 +80,7 @@ from oaknut.basic.tokens import (
79
80
  TOKEN_TO_KEYWORD,
80
81
  )
81
82
 
82
- __version__ = "12.10.0"
83
+ __version__ = "12.11.0"
83
84
 
84
85
  # Canonical load addresses for BBC BASIC programs on each host.
85
86
  # Programs saved by *SAVE on a real machine use these by default.
@@ -87,6 +88,8 @@ BBC_BASIC_LOAD_ADDRESS = 0x1900
87
88
  ELECTRON_BASIC_LOAD_ADDRESS = 0x0E00
88
89
 
89
90
  __all__ = [
91
+ "BASIC_II",
92
+ "BASIC_V",
90
93
  "BBC_BASIC_LOAD_ADDRESS",
91
94
  "DEFAULT_LINE_NUMBER",
92
95
  "DEFAULT_LINE_STEP",
@@ -110,6 +113,7 @@ __all__ = [
110
113
  "DataFileError",
111
114
  "DataFileTypeMismatchError",
112
115
  "DetokeniseError",
116
+ "Dialect",
113
117
  "Float5RangeError",
114
118
  "IntegerRangeError",
115
119
  "InvalidLineLengthError",
@@ -260,7 +260,16 @@ def tokenise(
260
260
  help='Text encoding for the OUTPUT source. Use "acorn" for the BBC '
261
261
  "character set with CR line endings (e.g. writing back to a disc image).",
262
262
  )
263
- def detokenise(input_stream, output_stream, encoding: str) -> None:
263
+ @click.option(
264
+ "--dialect",
265
+ type=click.Choice(["ii", "2", "v", "5"]),
266
+ default="ii",
267
+ show_default=True,
268
+ help='BBC BASIC dialect, as a Roman or Arabic numeral. Use "v" (or "5") for '
269
+ "Archimedes / RISC OS programs, so the &C6/&C7/&C8 escape tokens "
270
+ "(CASE, SYS, ORIGIN, ...) decode correctly.",
271
+ )
272
+ def detokenise(input_stream, output_stream, encoding: str, dialect: str) -> None:
264
273
  """De-tokenise a stored BBC BASIC program into source text.
265
274
 
266
275
  Reads a tokenised program from INPUT and writes numbered source text
@@ -273,10 +282,17 @@ def detokenise(input_stream, output_stream, encoding: str) -> None:
273
282
  OUTPUT is written in --encoding (``utf-8`` with host-native ``LF`` line
274
283
  endings by default, for a host text file; pass ``acorn`` for the BBC
275
284
  character set with ``CR`` endings, e.g. writing back to a disc image).
285
+
286
+ Pass ``--dialect v`` (or ``--dialect 5``) for Archimedes / RISC OS
287
+ programs (BBC BASIC V), whose extended keywords use the &C6/&C7/&C8
288
+ two-byte escape tokens. The Roman (``ii``/``v``) and Arabic
289
+ (``2``/``5``) numerals are interchangeable.
276
290
  """
291
+ from oaknut.basic import BASIC_II, BASIC_V
277
292
  from oaknut.basic import detokenise as detokenise_program
278
293
 
279
- listing = detokenise_program(input_stream.read())
294
+ selected_dialect = BASIC_V if dialect in ("v", "5") else BASIC_II
295
+ listing = detokenise_program(input_stream.read(), dialect=selected_dialect)
280
296
  output_stream.write(_listing_to_bytes(listing, encoding))
281
297
 
282
298
 
@@ -26,15 +26,21 @@ including) the next one, so ``length = 4 + len(body)``.
26
26
 
27
27
  from __future__ import annotations
28
28
 
29
+ from oaknut.basic.dialect import BASIC_II, Dialect
29
30
  from oaknut.basic.scanner import scan, scan_program
30
31
 
31
32
 
32
- def detokenise(data: bytes) -> str:
33
- """De-tokenise a BBC BASIC II program into source text.
33
+ def detokenise(data: bytes, *, dialect: Dialect = BASIC_II) -> str:
34
+ """De-tokenise a stored BBC BASIC program into source text.
34
35
 
35
36
  Args:
36
37
  data: A tokenised program, as stored on disc or in memory,
37
38
  terminated by the ``&0D &FF`` end marker.
39
+ dialect: The BBC BASIC variant whose token tables to use.
40
+ Defaults to :data:`~oaknut.basic.BASIC_II`; pass
41
+ :data:`~oaknut.basic.BASIC_V` for Archimedes / RISC OS
42
+ programs so the ``&C6``/``&C7``/``&C8`` escape tokens and the
43
+ re-purposed single-byte tokens decode correctly.
38
44
 
39
45
  Returns:
40
46
  The program as numbered source text, lines separated by ``"\\n"``
@@ -47,11 +53,11 @@ def detokenise(data: bytes) -> str:
47
53
  """
48
54
  return "".join(
49
55
  f"{record.line_number}{''.join(token.value for token in record.tokens)}\n"
50
- for record in scan_program(data)
56
+ for record in scan_program(data, dialect=dialect)
51
57
  )
52
58
 
53
59
 
54
- def detokenise_body(data: bytes) -> str:
60
+ def detokenise_body(data: bytes, *, dialect: Dialect = BASIC_II) -> str:
55
61
  """De-tokenise an unframed line body into text.
56
62
 
57
63
  Unlike :func:`detokenise`, this expects a line *body* — inline token
@@ -62,6 +68,9 @@ def detokenise_body(data: bytes) -> str:
62
68
 
63
69
  Args:
64
70
  data: A line body of inline token bytes.
71
+ dialect: The BBC BASIC variant whose token tables to use, passed
72
+ through to :func:`scan`. Defaults to
73
+ :data:`~oaknut.basic.BASIC_II`.
65
74
 
66
75
  Returns:
67
76
  The body as source text, using latin-1/code-point semantics so it
@@ -70,4 +79,4 @@ def detokenise_body(data: bytes) -> str:
70
79
  Raises:
71
80
  DetokeniseError: A ``&8D`` line-number reference is truncated.
72
81
  """
73
- return "".join(token.value for token in scan(data))
82
+ return "".join(token.value for token in scan(data, dialect=dialect))
@@ -0,0 +1,164 @@
1
+ """BBC BASIC dialects for de-tokenisation.
2
+
3
+ The :mod:`~oaknut.basic.tokens` table is **BBC BASIC II** — the BBC
4
+ Micro's 8-bit language ROM. BBC BASIC V (the Archimedes / RISC OS ARM
5
+ BASIC) keeps that table but extends it two ways, and de-tokenising a
6
+ BASIC V program with the BASIC II table corrupts every extended keyword:
7
+
8
+ - Three BASIC II command tokens — ``&C6``, ``&C7``, ``&C8`` (``AUTO``,
9
+ ``DELETE``, ``LOAD``) — become **two-byte escape prefixes**. The byte
10
+ that follows selects an extended keyword, with the second byte counting
11
+ from ``&8E`` in each table: ``&C6`` for the extended *functions*
12
+ (``SUM``, ``BEAT``), ``&C7`` for the extended *commands* (``APPEND`` …
13
+ ``RENUMBER`` … ``INSTALL``), and ``&C8`` for the extended *statements*
14
+ (``CASE``, ``CIRCLE``, ``ORIGIN``, ``SYS`` …).
15
+
16
+ - Several single-byte slots are re-purposed. ``&7F`` becomes
17
+ ``OTHERWISE``, and ``&C9``-``&CE`` — ``LIST``/``NEW``/``OLD``/
18
+ ``RENUMBER``/``SAVE`` and an unused gap in BASIC II — become the block
19
+ keywords ``WHEN``/``OF``/``ENDCASE``/``ELSE``/``ENDIF``/``ENDWHILE``
20
+ (the command spellings move into the ``&C7`` escape table).
21
+
22
+ A :class:`Dialect` bundles the single-byte token map with the escape
23
+ tables, so the scanner and de-tokeniser can target either language with
24
+ one parameter. :data:`BASIC_II` is the default everywhere — code that
25
+ does not name a dialect keeps its existing BASIC II behaviour.
26
+
27
+ The token values were cross-checked against two independent
28
+ implementations: Justin Fletcher's de-tokeniser tables (the
29
+ ``gerph/riscos-basic-detokenise`` ``c/basic`` source) and Steve Fryatt's
30
+ ``tokenize`` (``src/parse.c``), which agree byte-for-byte.
31
+ """
32
+
33
+ from __future__ import annotations
34
+
35
+ from collections.abc import Mapping
36
+ from dataclasses import dataclass
37
+ from types import MappingProxyType
38
+
39
+ from oaknut.basic.tokens import TOKEN_TO_KEYWORD
40
+
41
+
42
+ @dataclass(frozen=True)
43
+ class Dialect:
44
+ """A BBC BASIC variant's token-to-keyword mapping.
45
+
46
+ Attributes:
47
+ name: Human-readable dialect name (e.g. ``"BBC BASIC V"``).
48
+ single_byte: Maps a single token byte to its keyword spelling.
49
+ Covers the whole one-byte keyword range, including the few
50
+ sub-``&80`` tokens such as ``&7F`` (``OTHERWISE``).
51
+ escape: Maps a two-byte escape *prefix* (``&C6``/``&C7``/``&C8``
52
+ in BASIC V) to a table from the *following* byte to its
53
+ extended keyword. Empty for dialects without escape tokens.
54
+ """
55
+
56
+ name: str
57
+ single_byte: Mapping[int, str]
58
+ escape: Mapping[int, Mapping[int, str]]
59
+
60
+
61
+ # --- BBC BASIC II -----------------------------------------------------------
62
+
63
+ #: BBC BASIC II — the BBC Micro's 8-bit language ROM. No escape tokens;
64
+ #: the single-byte map is the canonical :data:`TOKEN_TO_KEYWORD` table.
65
+ BASIC_II = Dialect(name="BBC BASIC II", single_byte=TOKEN_TO_KEYWORD, escape=MappingProxyType({}))
66
+
67
+
68
+ # --- BBC BASIC V ------------------------------------------------------------
69
+
70
+ # The extended-token tables, keyed by the byte that follows the prefix.
71
+ # Each runs from &8E upward, in ROM order.
72
+ _EXTENDED_FUNCTIONS = ("SUM", "BEAT")
73
+ _EXTENDED_COMMANDS = (
74
+ "APPEND",
75
+ "AUTO",
76
+ "CRUNCH",
77
+ "DELETE",
78
+ "EDIT",
79
+ "HELP",
80
+ "LIST",
81
+ "LOAD",
82
+ "LVAR",
83
+ "NEW",
84
+ "OLD",
85
+ "RENUMBER",
86
+ "SAVE",
87
+ "TEXTLOAD",
88
+ "TEXTSAVE",
89
+ "TWIN",
90
+ "TWINO",
91
+ "INSTALL",
92
+ )
93
+ _EXTENDED_STATEMENTS = (
94
+ "CASE",
95
+ "CIRCLE",
96
+ "FILL",
97
+ "ORIGIN",
98
+ "POINT",
99
+ "RECTANGLE",
100
+ "SWAP",
101
+ "WHILE",
102
+ "WAIT",
103
+ "MOUSE",
104
+ "QUIT",
105
+ "SYS",
106
+ "INSTALL",
107
+ "LIBRARY",
108
+ "TINT",
109
+ "ELLIPSE",
110
+ "BEATS",
111
+ "TEMPO",
112
+ "VOICES",
113
+ "VOICE",
114
+ "STEREO",
115
+ "OVERLAY",
116
+ )
117
+
118
+ _EXTENDED_TOKEN_BASE = 0x8E
119
+
120
+
121
+ def _extended_table(keywords: tuple[str, ...]) -> Mapping[int, str]:
122
+ """Build a {second byte -> keyword} table counting from ``&8E``."""
123
+ return MappingProxyType(
124
+ {_EXTENDED_TOKEN_BASE + offset: keyword for offset, keyword in enumerate(keywords)}
125
+ )
126
+
127
+
128
+ # Single-byte tokens BASIC V re-purposes from BASIC II. &7F is new;
129
+ # &C9-&CE displace the LIST/NEW/OLD/RENUMBER/SAVE command tokens (whose
130
+ # spellings move into the &C7 escape table) and the unused &CE gap.
131
+ _BASIC_V_SINGLE_BYTE_OVERRIDES = {
132
+ 0x7F: "OTHERWISE",
133
+ 0xC9: "WHEN",
134
+ 0xCA: "OF",
135
+ 0xCB: "ENDCASE",
136
+ 0xCC: "ELSE",
137
+ 0xCD: "ENDIF",
138
+ 0xCE: "ENDWHILE",
139
+ }
140
+
141
+ # Start from BASIC II, drop the three bytes that are now escape prefixes
142
+ # (so a bare prefix never resolves to AUTO/DELETE/LOAD), then apply the
143
+ # re-purposed single-byte slots.
144
+ _basic_v_single_byte = {
145
+ token: keyword
146
+ for token, keyword in TOKEN_TO_KEYWORD.items()
147
+ if token not in (0xC6, 0xC7, 0xC8)
148
+ }
149
+ _basic_v_single_byte.update(_BASIC_V_SINGLE_BYTE_OVERRIDES)
150
+
151
+ #: BBC BASIC V — the Archimedes / RISC OS ARM BASIC. Adds the
152
+ #: ``&C6``/``&C7``/``&C8`` escape tables and the re-purposed single-byte
153
+ #: tokens described in this module's docstring.
154
+ BASIC_V = Dialect(
155
+ name="BBC BASIC V",
156
+ single_byte=MappingProxyType(_basic_v_single_byte),
157
+ escape=MappingProxyType(
158
+ {
159
+ 0xC6: _extended_table(_EXTENDED_FUNCTIONS),
160
+ 0xC7: _extended_table(_EXTENDED_COMMANDS),
161
+ 0xC8: _extended_table(_EXTENDED_STATEMENTS),
162
+ }
163
+ ),
164
+ )
@@ -14,9 +14,35 @@ stream plumbing on top.
14
14
 
15
15
  from __future__ import annotations
16
16
 
17
+ import re
18
+
17
19
  DEFAULT_LINE_NUMBER = 10
18
20
  DEFAULT_LINE_STEP = 10
19
21
 
22
+ # A BBC BASIC line ends only at a carriage return (or a host newline);
23
+ # Python's str.splitlines() additionally breaks on &0B, &0C, &1C-&1E and
24
+ # others, which occur legitimately inside string literals (mode-7 / VDU
25
+ # control codes), so split strictly on the real terminators only. This
26
+ # definition is shared with the tokeniser, which imports it from here.
27
+ LINE_SEPARATOR_RE = re.compile(r"\r\n|\r|\n")
28
+
29
+
30
+ def split_source_lines(source: str) -> list[str]:
31
+ """Split BBC BASIC source into lines on real terminators only.
32
+
33
+ Matches :meth:`str.splitlines` semantics for the terminators a BBC
34
+ BASIC line actually ends on (``"\\n"``, ``"\\r"``, ``"\\r\\n"``) — an
35
+ empty *source* yields no lines and a single trailing terminator is
36
+ not treated as introducing a final empty line — but, unlike
37
+ :meth:`str.splitlines`, never breaks on the in-string control codes
38
+ (&0B, &0C, &1C-&1E, ...) that BBC BASIC string literals legitimately
39
+ contain.
40
+ """
41
+ lines = LINE_SEPARATOR_RE.split(source)
42
+ if lines and lines[-1] == "":
43
+ lines.pop()
44
+ return lines
45
+
20
46
 
21
47
  def number_lines(
22
48
  source: str,
@@ -54,7 +80,7 @@ def number_lines(
54
80
 
55
81
  number = start
56
82
  numbered = []
57
- for line in source.splitlines():
83
+ for line in split_source_lines(source):
58
84
  numbered.append(f"{number} {line}")
59
85
  number += step
60
86
 
@@ -26,13 +26,14 @@ import enum
26
26
  from collections.abc import Iterator
27
27
  from dataclasses import dataclass
28
28
 
29
+ from oaknut.basic.dialect import BASIC_II, Dialect
29
30
  from oaknut.basic.exceptions import (
30
31
  InvalidLineLengthError,
31
32
  MissingLineMarkerError,
32
33
  TruncatedProgramError,
33
34
  )
34
35
  from oaknut.basic.linenumber import decode_line_number
35
- from oaknut.basic.tokens import HEADER_LENGTH, LINE_NUMBER_TOKEN, TOKEN_TO_KEYWORD
36
+ from oaknut.basic.tokens import HEADER_LENGTH, LINE_NUMBER_TOKEN
36
37
 
37
38
  _CR = 0x0D
38
39
  _END_MARKER = 0xFF
@@ -90,7 +91,7 @@ class LineRecord:
90
91
  start: int
91
92
 
92
93
 
93
- def scan(data: bytes, *, base_offset: int = 0) -> Iterator[Token]:
94
+ def scan(data: bytes, *, base_offset: int = 0, dialect: Dialect = BASIC_II) -> Iterator[Token]:
94
95
  """Scan an unframed line body into a stream of :class:`Token`.
95
96
 
96
97
  Args:
@@ -99,10 +100,17 @@ def scan(data: bytes, *, base_offset: int = 0) -> Iterator[Token]:
99
100
  base_offset: Added to every token's :attr:`~Token.start`, so
100
101
  offsets can be reported against a larger buffer (for example
101
102
  the line body's position within a whole program).
103
+ dialect: The BBC BASIC variant whose token tables to use.
104
+ Defaults to :data:`~oaknut.basic.BASIC_II`; pass
105
+ :data:`~oaknut.basic.BASIC_V` to resolve the
106
+ ``&C6``/``&C7``/``&C8`` two-byte escape tokens and the
107
+ re-purposed single-byte tokens of the Archimedes language.
102
108
 
103
109
  Yields:
104
110
  One :class:`Token` per keyword, literal run, string literal, and
105
- ``&8D`` line-number reference, in source order.
111
+ ``&8D`` line-number reference, in source order. For a two-byte
112
+ escape keyword the token's :attr:`~Token.token` is the prefix
113
+ byte and its :attr:`~Token.start` is the prefix's offset.
106
114
 
107
115
  Raises:
108
116
  DetokeniseError: A ``&8D`` reference is truncated.
@@ -141,13 +149,20 @@ def scan(data: bytes, *, base_offset: int = 0) -> Iterator[Token]:
141
149
  yield Token(TokenKind.LINENUM, str(decode_line_number(payload)), base_offset + i)
142
150
  i += 4
143
151
  continue
144
- if b >= 0x80 and b in TOKEN_TO_KEYWORD:
152
+ if b in dialect.escape and i + 1 < n and data[i + 1] in dialect.escape[b]:
145
153
  yield from flush()
146
- yield Token(TokenKind.KEYWORD, TOKEN_TO_KEYWORD[b], base_offset + i, b)
154
+ keyword = dialect.escape[b][data[i + 1]]
155
+ yield Token(TokenKind.KEYWORD, keyword, base_offset + i, b)
156
+ i += 2
157
+ continue
158
+ if b in dialect.single_byte:
159
+ yield from flush()
160
+ yield Token(TokenKind.KEYWORD, dialect.single_byte[b], base_offset + i, b)
147
161
  i += 1
148
162
  continue
149
- # A literal byte (< &80) or an unknown high byte (e.g. the &CE
150
- # gap): coalesce into a TEXT run so the byte still round-trips.
163
+ # A literal byte, an unknown high byte, or an escape prefix whose
164
+ # following byte has no extended-token meaning: coalesce into a
165
+ # TEXT run so the byte still round-trips.
151
166
  if text_start < 0:
152
167
  text_start = i
153
168
  text_chars.append(chr(b))
@@ -155,12 +170,15 @@ def scan(data: bytes, *, base_offset: int = 0) -> Iterator[Token]:
155
170
  yield from flush()
156
171
 
157
172
 
158
- def scan_program(data: bytes) -> Iterator[LineRecord]:
173
+ def scan_program(data: bytes, *, dialect: Dialect = BASIC_II) -> Iterator[LineRecord]:
159
174
  """Walk a ``&0D``-framed stored program, scanning each line body.
160
175
 
161
176
  Args:
162
177
  data: A tokenised program as stored on disc or in memory,
163
178
  terminated by the ``&0D &FF`` end marker.
179
+ dialect: The BBC BASIC variant whose token tables to use, passed
180
+ through to :func:`scan`. Defaults to
181
+ :data:`~oaknut.basic.BASIC_II`.
164
182
 
165
183
  Yields:
166
184
  One :class:`LineRecord` per line, in storage order, each carrying
@@ -187,6 +205,6 @@ def scan_program(data: bytes) -> Iterator[LineRecord]:
187
205
  if length < HEADER_LENGTH or i + length > n:
188
206
  raise InvalidLineLengthError(offset=i, length=length)
189
207
  body = data[i + HEADER_LENGTH : i + length]
190
- tokens = tuple(scan(body, base_offset=i + HEADER_LENGTH))
208
+ tokens = tuple(scan(body, base_offset=i + HEADER_LENGTH, dialect=dialect))
191
209
  yield LineRecord(line_number=line_number, tokens=tokens, start=i)
192
210
  i += length
@@ -27,8 +27,6 @@ the CLI does it at its I/O boundary.
27
27
 
28
28
  from __future__ import annotations
29
29
 
30
- import re
31
-
32
30
  from oaknut.basic.exceptions import (
33
31
  AlreadyNumberedError,
34
32
  LineNumberOrderError,
@@ -37,7 +35,12 @@ from oaknut.basic.exceptions import (
37
35
  UnnumberedLineError,
38
36
  )
39
37
  from oaknut.basic.linenumber import MAX_LINE_NUMBER, encode_line_number
40
- from oaknut.basic.numbering import DEFAULT_LINE_NUMBER, DEFAULT_LINE_STEP, number_lines
38
+ from oaknut.basic.numbering import (
39
+ DEFAULT_LINE_NUMBER,
40
+ DEFAULT_LINE_STEP,
41
+ LINE_SEPARATOR_RE,
42
+ number_lines,
43
+ )
41
44
  from oaknut.basic.tokens import (
42
45
  FLAG_CONDITIONAL,
43
46
  FLAG_FN_PROC,
@@ -56,15 +59,9 @@ from oaknut.basic.tokens import (
56
59
  _CR = 0x0D
57
60
  _END_MARKER = 0xFF
58
61
 
59
- # A BBC BASIC line ends only at a carriage return (or a host newline);
60
- # Python's str.splitlines() additionally breaks on &0B, &0C, &1C-&1E and
61
- # others, which occur legitimately inside string literals (mode-7 / VDU
62
- # control codes), so split strictly on the real terminators only.
63
- _LINE_SEPARATOR_RE = re.compile(r"\r\n|\r|\n")
64
-
65
62
 
66
63
  def _split_source_lines(source: str) -> list[str]:
67
- return _LINE_SEPARATOR_RE.split(source)
64
+ return LINE_SEPARATOR_RE.split(source)
68
65
 
69
66
  # Keyword entries grouped by first character, preserving ROM order within
70
67
  # each group, so the crunch only scans the relevant group.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: oaknut-basic
3
- Version: 12.10.0
3
+ Version: 12.11.0
4
4
  Summary: BBC BASIC tools: program tokeniser/de-tokeniser and PRINT#/INPUT# data-file reader/writer
5
5
  Author-email: Robert Smallshire <robert@smallshire.org.uk>
6
6
  License-Expression: MIT
@@ -5,6 +5,7 @@ src/oaknut/basic/__init__.py
5
5
  src/oaknut/basic/cli.py
6
6
  src/oaknut/basic/datafile.py
7
7
  src/oaknut/basic/detokeniser.py
8
+ src/oaknut/basic/dialect.py
8
9
  src/oaknut/basic/exceptions.py
9
10
  src/oaknut/basic/float5.py
10
11
  src/oaknut/basic/linenumber.py
@@ -19,6 +20,7 @@ src/oaknut_basic.egg-info/entry_points.txt
19
20
  src/oaknut_basic.egg-info/requires.txt
20
21
  src/oaknut_basic.egg-info/top_level.txt
21
22
  tests/test_basic.py
23
+ tests/test_basic_v.py
22
24
  tests/test_cli.py
23
25
  tests/test_crunch_rules.py
24
26
  tests/test_data_cli.py
@@ -0,0 +1,195 @@
1
+ """Tests for the BBC BASIC V (Archimedes/RISC OS) detokenising dialect.
2
+
3
+ BASIC V re-uses three BASIC II command tokens — ``&C6``/``&C7``/``&C8`` —
4
+ as two-byte *escape prefixes*: the following byte selects an extended
5
+ keyword (``SUM``, ``RENUMBER``, ``CASE``/``SYS``/``ORIGIN`` ...). It also
6
+ re-purposes several single-byte slots: ``&7F`` becomes ``OTHERWISE`` and
7
+ ``&C9``-``&CE`` — ``LIST``/``NEW``/``OLD``/``RENUMBER``/``SAVE`` in
8
+ BASIC II — become the block keywords ``WHEN``/``OF``/``ENDCASE``/
9
+ ``ELSE``/``ENDIF``/``ENDWHILE``. The default dialect stays BASIC II, so
10
+ the escape bytes still decode to ``AUTO``/``DELETE``/``LOAD`` unless the
11
+ caller opts into :data:`BASIC_V`.
12
+ """
13
+
14
+ import pytest
15
+ from oaknut.basic import (
16
+ BASIC_II,
17
+ BASIC_V,
18
+ Dialect,
19
+ TokenKind,
20
+ detokenise,
21
+ detokenise_body,
22
+ scan,
23
+ )
24
+
25
+
26
+ def _program(*lines: tuple[int, bytes]) -> bytes:
27
+ """Frame (line_number, body) pairs into a tokenised program."""
28
+ out = bytearray()
29
+ for line_number, body in lines:
30
+ out += bytes((0x0D, (line_number >> 8) & 0xFF, line_number & 0xFF, 4 + len(body)))
31
+ out += body
32
+ out += b"\x0d\xff"
33
+ return bytes(out)
34
+
35
+
36
+ class TestEscapeStatements:
37
+ """``&C8`` <byte> — extended statement tokens, second byte from &8E."""
38
+
39
+ @pytest.mark.parametrize(
40
+ ("second", "keyword"),
41
+ [
42
+ (0x8E, "CASE"),
43
+ (0x8F, "CIRCLE"),
44
+ (0x90, "FILL"),
45
+ (0x91, "ORIGIN"),
46
+ (0x92, "POINT"),
47
+ (0x93, "RECTANGLE"),
48
+ (0x94, "SWAP"),
49
+ (0x95, "WHILE"),
50
+ (0x96, "WAIT"),
51
+ (0x97, "MOUSE"),
52
+ (0x98, "QUIT"),
53
+ (0x99, "SYS"),
54
+ (0x9A, "INSTALL"),
55
+ (0x9B, "LIBRARY"),
56
+ (0x9C, "TINT"),
57
+ (0x9D, "ELLIPSE"),
58
+ (0x9E, "BEATS"),
59
+ (0x9F, "TEMPO"),
60
+ (0xA0, "VOICES"),
61
+ (0xA1, "VOICE"),
62
+ (0xA2, "STEREO"),
63
+ (0xA3, "OVERLAY"),
64
+ ],
65
+ )
66
+ def test_c8_escape_decodes_to_statement(self, second: int, keyword: str):
67
+ assert detokenise_body(bytes((0xC8, second)), dialect=BASIC_V) == keyword
68
+
69
+
70
+ class TestEscapeCommands:
71
+ """``&C7`` <byte> — extended command tokens, second byte from &8E."""
72
+
73
+ @pytest.mark.parametrize(
74
+ ("second", "keyword"),
75
+ [
76
+ (0x8E, "APPEND"),
77
+ (0x8F, "AUTO"),
78
+ (0x90, "CRUNCH"),
79
+ (0x91, "DELETE"),
80
+ (0x92, "EDIT"),
81
+ (0x93, "HELP"),
82
+ (0x94, "LIST"),
83
+ (0x95, "LOAD"),
84
+ (0x96, "LVAR"),
85
+ (0x97, "NEW"),
86
+ (0x98, "OLD"),
87
+ (0x99, "RENUMBER"),
88
+ (0x9A, "SAVE"),
89
+ (0x9B, "TEXTLOAD"),
90
+ (0x9C, "TEXTSAVE"),
91
+ (0x9D, "TWIN"),
92
+ (0x9E, "TWINO"),
93
+ (0x9F, "INSTALL"),
94
+ ],
95
+ )
96
+ def test_c7_escape_decodes_to_command(self, second: int, keyword: str):
97
+ assert detokenise_body(bytes((0xC7, second)), dialect=BASIC_V) == keyword
98
+
99
+
100
+ class TestEscapeFunctions:
101
+ """``&C6`` <byte> — the two extended function tokens."""
102
+
103
+ @pytest.mark.parametrize(("second", "keyword"), [(0x8E, "SUM"), (0x8F, "BEAT")])
104
+ def test_c6_escape_decodes_to_function(self, second: int, keyword: str):
105
+ assert detokenise_body(bytes((0xC6, second)), dialect=BASIC_V) == keyword
106
+
107
+
108
+ class TestSingleByteDifferences:
109
+ """Single-byte tokens BASIC V re-purposes from BASIC II."""
110
+
111
+ @pytest.mark.parametrize(
112
+ ("token", "keyword"),
113
+ [
114
+ (0x7F, "OTHERWISE"),
115
+ (0xC9, "WHEN"),
116
+ (0xCA, "OF"),
117
+ (0xCB, "ENDCASE"),
118
+ (0xCC, "ELSE"),
119
+ (0xCD, "ENDIF"),
120
+ (0xCE, "ENDWHILE"),
121
+ ],
122
+ )
123
+ def test_single_byte_token_decodes_to_basic_v_keyword(self, token: int, keyword: str):
124
+ assert detokenise_body(bytes((token,)), dialect=BASIC_V) == keyword
125
+
126
+ def test_shared_single_byte_tokens_are_unchanged(self):
127
+ # Tokens common to both dialects still decode the same way.
128
+ assert detokenise_body(b"\xf1", dialect=BASIC_V) == "PRINT"
129
+ assert detokenise_body(b"\x8f", dialect=BASIC_V) == "PTR"
130
+
131
+
132
+ class TestIssue44Reproduction:
133
+ def test_origin_statement_is_not_loadtime(self):
134
+ # Line 140 of "Herding (RR1)", Acorn User June 1991:
135
+ # C8 91 " 640,512" -> ORIGIN 640,512 (was: LOADTIME 640,512)
136
+ body = bytes((0xC8, 0x91)) + b" 640,512"
137
+ assert detokenise_body(body, dialect=BASIC_V) == "ORIGIN 640,512"
138
+
139
+ def test_program_round_through_detokenise(self):
140
+ program = _program((140, bytes((0xC8, 0x91)) + b" 640,512"))
141
+ assert detokenise(program, dialect=BASIC_V) == "140ORIGIN 640,512\n"
142
+
143
+
144
+ class TestDefaultDialectUnchanged:
145
+ """Without opting in, the escape bytes keep their BASIC II meaning."""
146
+
147
+ def test_c8_defaults_to_load_plus_token(self):
148
+ body = bytes((0xC8, 0x91)) + b" 640,512"
149
+ assert detokenise_body(body) == "LOADTIME 640,512"
150
+
151
+ def test_default_dialect_is_basic_ii(self):
152
+ body = bytes((0xC8, 0x91))
153
+ assert detokenise_body(body, dialect=BASIC_II) == detokenise_body(body)
154
+
155
+ def test_basic_v_c9_is_list_under_basic_ii(self):
156
+ assert detokenise_body(b"\xc9") == "LIST"
157
+
158
+
159
+ class TestScanStream:
160
+ def test_escape_token_is_one_keyword_advancing_two_bytes(self):
161
+ body = bytes((0xC8, 0x99)) + b"X" # SYS then a literal X
162
+ tokens = list(scan(body, dialect=BASIC_V))
163
+ assert tokens[0].kind is TokenKind.KEYWORD
164
+ assert tokens[0].value == "SYS"
165
+ assert tokens[0].start == 0
166
+ assert tokens[1].kind is TokenKind.TEXT
167
+ assert tokens[1].value == "X"
168
+ assert tokens[1].start == 2
169
+
170
+ def test_escape_byte_inside_string_stays_literal(self):
171
+ body = b'"' + bytes((0xC8, 0x91)) + b'"'
172
+ tokens = list(scan(body, dialect=BASIC_V))
173
+ assert len(tokens) == 1
174
+ assert tokens[0].kind is TokenKind.STRING
175
+ assert tokens[0].value == '"\xc8\x91"'
176
+
177
+ def test_unterminated_escape_prefix_stays_literal(self):
178
+ # A trailing &C8 with no following byte must not over-read.
179
+ tokens = list(scan(b"\xc8", dialect=BASIC_V))
180
+ assert tokens[0].kind is TokenKind.TEXT
181
+ assert tokens[0].value == "\xc8"
182
+
183
+
184
+ class TestDialectObject:
185
+ def test_basic_v_has_a_name(self):
186
+ assert isinstance(BASIC_V, Dialect)
187
+ assert "V" in BASIC_V.name
188
+
189
+ def test_unknown_escape_second_byte_is_left_literal(self):
190
+ # &C8 followed by a byte with no extended-token meaning: emit the
191
+ # prefix as a literal rather than inventing a keyword or over-reading.
192
+ body = bytes((0xC8, 0x41)) # &41 = 'A', not an extended statement
193
+ tokens = list(scan(body, dialect=BASIC_V))
194
+ assert tokens[0].kind is TokenKind.TEXT
195
+ assert tokens[0].value.startswith("\xc8")
@@ -9,6 +9,7 @@ rather than re-testing the numbering or tokenising logic itself.
9
9
  from pathlib import Path
10
10
 
11
11
  import oaknut.basic as basic
12
+ import pytest
12
13
  from click.testing import CliRunner
13
14
  from oaknut.basic.cli import cli
14
15
 
@@ -230,6 +231,31 @@ class TestDetokeniseCommand:
230
231
  assert result.exit_code != 0
231
232
  assert "offset 0" in result.output
232
233
 
234
+ @pytest.mark.parametrize("dialect", ["v", "5"])
235
+ def test_dialect_v_decodes_escape_tokens(self, dialect: str):
236
+ # &C8 &91 is the BASIC V ORIGIN statement, not "LOADTIME".
237
+ # The Roman "v" and Arabic "5" numerals are interchangeable.
238
+ program = b"\x0d\x00\x8c\x0e\xc8\x91 640,512\x0d\xff"
239
+ runner = CliRunner()
240
+ result = runner.invoke(cli, ["detokenise", "--dialect", dialect], input=program)
241
+ assert result.exit_code == 0
242
+ assert result.stdout_bytes == b"140ORIGIN 640,512\n"
243
+
244
+ @pytest.mark.parametrize("dialect", ["ii", "2"])
245
+ def test_dialect_ii_keeps_basic_ii_meaning(self, dialect: str):
246
+ program = b"\x0d\x00\x8c\x0e\xc8\x91 640,512\x0d\xff"
247
+ runner = CliRunner()
248
+ result = runner.invoke(cli, ["detokenise", "--dialect", dialect], input=program)
249
+ assert result.exit_code == 0
250
+ assert result.stdout_bytes == b"140LOADTIME 640,512\n"
251
+
252
+ def test_default_dialect_is_basic_ii(self):
253
+ program = b"\x0d\x00\x8c\x0e\xc8\x91 640,512\x0d\xff"
254
+ runner = CliRunner()
255
+ result = runner.invoke(cli, ["detokenise"], input=program)
256
+ assert result.exit_code == 0
257
+ assert result.stdout_bytes == b"140LOADTIME 640,512\n"
258
+
233
259
 
234
260
  class TestTokeniseDetokeniseRoundTrip:
235
261
  def test_cli_round_trip(self):
@@ -47,6 +47,13 @@ class TestNumberLines:
47
47
  def test_blank_lines_are_numbered_too(self):
48
48
  assert basic.number_lines("A\n\nB") == "10 A\n20 \n30 B"
49
49
 
50
+ def test_control_codes_in_string_literal_do_not_split_the_line(self):
51
+ # VDU / mode-7 control codes (here &0C, &1C) appear legitimately
52
+ # inside string literals; str.splitlines() would break on them,
53
+ # corrupting a single program line into several numbered lines.
54
+ source = 'PRINT "' + chr(0x0C) + chr(0x1C) + 'banner"'
55
+ assert basic.number_lines(source) == '10 PRINT "\x0c\x1cbanner"'
56
+
50
57
  def test_internal_references_are_left_untouched(self):
51
58
  # The facade only prepends numbers; it does not rewrite GOTO etc.
52
59
  source = "GOTO 20\nEND"
File without changes
File without changes
File without changes