oaknut-basic 12.8.2__tar.gz → 12.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {oaknut_basic-12.8.2/src/oaknut_basic.egg-info → oaknut_basic-12.9.0}/PKG-INFO +1 -1
  2. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut/basic/__init__.py +39 -2
  3. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut/basic/cli.py +12 -8
  4. oaknut_basic-12.9.0/src/oaknut/basic/detokeniser.py +73 -0
  5. oaknut_basic-12.9.0/src/oaknut/basic/scanner.py +192 -0
  6. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut/basic/tokens.py +8 -3
  7. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0/src/oaknut_basic.egg-info}/PKG-INFO +1 -1
  8. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut_basic.egg-info/SOURCES.txt +2 -0
  9. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_cli.py +42 -2
  10. oaknut_basic-12.9.0/tests/test_scanner.py +179 -0
  11. oaknut_basic-12.8.2/src/oaknut/basic/detokeniser.py +0 -116
  12. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/LICENSE +0 -0
  13. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/README.md +0 -0
  14. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/pyproject.toml +0 -0
  15. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/setup.cfg +0 -0
  16. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut/basic/datafile.py +0 -0
  17. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut/basic/exceptions.py +0 -0
  18. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut/basic/float5.py +0 -0
  19. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut/basic/linenumber.py +0 -0
  20. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut/basic/numbering.py +0 -0
  21. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut/basic/tokeniser.py +0 -0
  22. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut_basic.egg-info/dependency_links.txt +0 -0
  23. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut_basic.egg-info/entry_points.txt +0 -0
  24. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut_basic.egg-info/requires.txt +0 -0
  25. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/src/oaknut_basic.egg-info/top_level.txt +0 -0
  26. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_basic.py +0 -0
  27. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_crunch_rules.py +0 -0
  28. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_data_cli.py +0 -0
  29. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_datafile.py +0 -0
  30. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_detokeniser.py +0 -0
  31. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_float5.py +0 -0
  32. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_keyword_coverage.py +0 -0
  33. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_linenumber.py +0 -0
  34. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_numbering.py +0 -0
  35. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_rom_golden.py +0 -0
  36. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_rom_golden_detokenise.py +0 -0
  37. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_tokeniser.py +0 -0
  38. {oaknut_basic-12.8.2 → oaknut_basic-12.9.0}/tests/test_tokens.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: oaknut-basic
3
- Version: 12.8.2
3
+ Version: 12.9.0
4
4
  Summary: BBC BASIC tools: program tokeniser/de-tokeniser and PRINT#/INPUT# data-file reader/writer
5
5
  Author-email: Robert Smallshire <robert@smallshire.org.uk>
6
6
  License-Expression: MIT
@@ -30,7 +30,7 @@ from oaknut.basic.datafile import (
30
30
  BbcBasicDataReader,
31
31
  BbcBasicDataWriter,
32
32
  )
33
- from oaknut.basic.detokeniser import detokenise
33
+ from oaknut.basic.detokeniser import detokenise, detokenise_body
34
34
  from oaknut.basic.exceptions import (
35
35
  AlreadyNumberedError,
36
36
  BASICError,
@@ -52,14 +52,34 @@ from oaknut.basic.exceptions import (
52
52
  UnnumberedLineError,
53
53
  )
54
54
  from oaknut.basic.float5 import pack_float5, unpack_float5
55
+ from oaknut.basic.linenumber import decode_line_number
55
56
  from oaknut.basic.numbering import (
56
57
  DEFAULT_LINE_NUMBER,
57
58
  DEFAULT_LINE_STEP,
58
59
  number_lines,
59
60
  )
61
+ from oaknut.basic.scanner import (
62
+ LineRecord,
63
+ Token,
64
+ TokenKind,
65
+ scan,
66
+ scan_program,
67
+ )
60
68
  from oaknut.basic.tokeniser import tokenise
69
+ from oaknut.basic.tokens import (
70
+ FLAG_CONDITIONAL,
71
+ FLAG_FN_PROC,
72
+ FLAG_LINE_NUMBER,
73
+ FLAG_MIDDLE,
74
+ FLAG_PSEUDO_VAR,
75
+ FLAG_START,
76
+ FLAG_STOP_LINE,
77
+ KEYWORDS,
78
+ LINE_NUMBER_TOKEN,
79
+ TOKEN_TO_KEYWORD,
80
+ )
61
81
 
62
- __version__ = "12.8.2"
82
+ __version__ = "12.9.0"
63
83
 
64
84
  # Canonical load addresses for BBC BASIC programs on each host.
65
85
  # Programs saved by *SAVE on a real machine use these by default.
@@ -71,6 +91,16 @@ __all__ = [
71
91
  "DEFAULT_LINE_NUMBER",
72
92
  "DEFAULT_LINE_STEP",
73
93
  "ELECTRON_BASIC_LOAD_ADDRESS",
94
+ "FLAG_CONDITIONAL",
95
+ "FLAG_FN_PROC",
96
+ "FLAG_LINE_NUMBER",
97
+ "FLAG_MIDDLE",
98
+ "FLAG_PSEUDO_VAR",
99
+ "FLAG_START",
100
+ "FLAG_STOP_LINE",
101
+ "KEYWORDS",
102
+ "LINE_NUMBER_TOKEN",
103
+ "TOKEN_TO_KEYWORD",
74
104
  "AlreadyNumberedError",
75
105
  "BASICError",
76
106
  "BbcBasicDataFile",
@@ -85,17 +115,24 @@ __all__ = [
85
115
  "InvalidLineLengthError",
86
116
  "LineNumberOrderError",
87
117
  "LineNumberRangeError",
118
+ "LineRecord",
88
119
  "LineTooLongError",
89
120
  "MissingLineMarkerError",
90
121
  "StringTooLongError",
122
+ "Token",
123
+ "TokenKind",
91
124
  "TokeniseError",
92
125
  "TruncatedProgramError",
93
126
  "TruncatedRecordError",
94
127
  "UnknownTagError",
95
128
  "UnnumberedLineError",
129
+ "decode_line_number",
96
130
  "detokenise",
131
+ "detokenise_body",
97
132
  "number_lines",
98
133
  "pack_float5",
134
+ "scan",
135
+ "scan_program",
99
136
  "unpack_float5",
100
137
  "tokenise",
101
138
  ]
@@ -190,10 +190,11 @@ def _listing_to_bytes(listing: str, encoding: str) -> bytes:
190
190
  )
191
191
  @click.option(
192
192
  "--encoding",
193
- default="acorn",
193
+ default="utf-8",
194
194
  show_default=True,
195
195
  callback=_validate_encoding,
196
- help="Text encoding of the INPUT source. Defaults to the BBC character set.",
196
+ help='Text encoding of the INPUT source. Use "acorn" for the BBC '
197
+ "character set (e.g. source taken straight off a disc image).",
197
198
  )
198
199
  @click.option(
199
200
  "--start",
@@ -226,8 +227,9 @@ def tokenise(
226
227
 
227
228
  oaknut-basic tokenise --start 10 unnumbered.bas MENU
228
229
 
229
- INPUT is read in --encoding (the BBC ``acorn`` character set by default;
230
- pass ``utf-8`` for source authored in a modern editor).
230
+ INPUT is read in --encoding (``utf-8`` by default, for source authored
231
+ in a modern editor; pass ``acorn`` for the BBC character set, e.g.
232
+ source taken straight off a disc image).
231
233
  """
232
234
  from oaknut.basic import tokenise as tokenise_source
233
235
 
@@ -252,10 +254,11 @@ def tokenise(
252
254
  )
253
255
  @click.option(
254
256
  "--encoding",
255
- default="acorn",
257
+ default="utf-8",
256
258
  show_default=True,
257
259
  callback=_validate_encoding,
258
- help="Text encoding for the OUTPUT source. Defaults to the BBC character set.",
260
+ help='Text encoding for the OUTPUT source. Use "acorn" for the BBC '
261
+ "character set with CR line endings (e.g. writing back to a disc image).",
259
262
  )
260
263
  def detokenise(input_stream, output_stream, encoding: str) -> None:
261
264
  """De-tokenise a stored BBC BASIC program into source text.
@@ -267,8 +270,9 @@ def detokenise(input_stream, output_stream, encoding: str) -> None:
267
270
  oaknut-basic detokenise MENU menu.bas
268
271
  disc get game.ssd MENU - | oaknut-basic detokenise
269
272
 
270
- OUTPUT is written in --encoding (the BBC ``acorn`` character set with CR
271
- line endings by default; pass ``utf-8`` for a host text file).
273
+ OUTPUT is written in --encoding (``utf-8`` with host-native ``LF`` line
274
+ endings by default, for a host text file; pass ``acorn`` for the BBC
275
+ character set with ``CR`` endings, e.g. writing back to a disc image).
272
276
  """
273
277
  from oaknut.basic import detokenise as detokenise_program
274
278
 
@@ -0,0 +1,73 @@
1
+ """De-tokenise BBC BASIC II bytes into source text.
2
+
3
+ The de-tokeniser is the *text projection* of the token scanner
4
+ (:mod:`oaknut.basic.scanner`): :func:`detokenise` mirrors the ROM's
5
+ ``LIST`` walk over the ``&0D``-framed stored form — rendering each line's
6
+ number followed by its body — while :func:`detokenise_body` flattens one
7
+ unframed line body. Both join the body :class:`~oaknut.basic.Token`
8
+ stream's values: keyword tokens expand to their table spelling, ``&8D``
9
+ references decode back to plain decimal, and bytes inside a quoted string
10
+ are emitted verbatim (no token expansion). Where keyword *adjacency*
11
+ matters — ``?QPLOT`` is not re-typeable as a program — reach past the
12
+ text for :func:`oaknut.basic.scan` instead.
13
+
14
+ The returned string uses latin-1/code-point semantics: a stored byte
15
+ ``b`` becomes ``chr(b)``, so the text round-trips byte-exactly through
16
+ :func:`oaknut.basic.tokenise`. Mapping to a host character set (Acorn,
17
+ UTF-8) is the caller's job — the CLI does it at its I/O boundary.
18
+
19
+ The line layout (from the disassembly) is::
20
+
21
+ &0D <lineNo-hi> <lineNo-lo> <length> <body...> ... &0D &FF
22
+
23
+ where *length* counts the whole record from this ``&0D`` up to (not
24
+ including) the next one, so ``length = 4 + len(body)``.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ from oaknut.basic.scanner import scan, scan_program
30
+
31
+
32
+ def detokenise(data: bytes) -> str:
33
+ """De-tokenise a BBC BASIC II program into source text.
34
+
35
+ Args:
36
+ data: A tokenised program, as stored on disc or in memory,
37
+ terminated by the ``&0D &FF`` end marker.
38
+
39
+ Returns:
40
+ The program as numbered source text, lines separated by ``"\\n"``
41
+ with a trailing newline. Empty for an empty program.
42
+
43
+ Raises:
44
+ DetokeniseError: The token stream is malformed — a missing line
45
+ marker, a truncated header or reference, or an impossible
46
+ length byte.
47
+ """
48
+ return "".join(
49
+ f"{record.line_number}{''.join(token.value for token in record.tokens)}\n"
50
+ for record in scan_program(data)
51
+ )
52
+
53
+
54
+ def detokenise_body(data: bytes) -> str:
55
+ """De-tokenise an unframed line body into text.
56
+
57
+ Unlike :func:`detokenise`, this expects a line *body* — inline token
58
+ bytes with no ``&0D`` framing and no leading line number, the form
59
+ BBC Micro Bot / Owlet programs and AUTO-style entry arrive in. It is
60
+ the text join of :func:`oaknut.basic.scan`; when keyword adjacency
61
+ matters, consume that token stream directly instead.
62
+
63
+ Args:
64
+ data: A line body of inline token bytes.
65
+
66
+ Returns:
67
+ The body as source text, using latin-1/code-point semantics so it
68
+ round-trips byte-exactly through :func:`oaknut.basic.tokenise`.
69
+
70
+ Raises:
71
+ DetokeniseError: A ``&8D`` line-number reference is truncated.
72
+ """
73
+ return "".join(token.value for token in scan(data))
@@ -0,0 +1,192 @@
1
+ """Scan tokenised BBC BASIC II bytes into a position-aware token stream.
2
+
3
+ De-tokenising to text is lossy for keyword adjacency: the body
4
+ ``?Q <PLOT> ?Q`` renders as the text ``?QPLOT?Q``, which the ROM
5
+ tokeniser re-reads as ``?`` followed by the *variable* ``QPLOT`` — the
6
+ embedded ``PLOT`` keyword is gone (after a non-keyword letter the crunch
7
+ consumes the whole identifier; see :func:`oaknut.basic.tokenise`). The
8
+ only faithful representation of a tokenised program is therefore the
9
+ *bytes*, where ``PLOT`` is a distinct token. :func:`scan` yields that
10
+ stream: keyword tokens resolved to their spelling, ``&8D`` line-number
11
+ references decoded, quoted strings preserved verbatim, and every other
12
+ byte coalesced into literal :attr:`TokenKind.TEXT` runs for the consumer
13
+ to sub-lex.
14
+
15
+ The walk mirrors the de-tokeniser's; in fact the de-tokeniser is one
16
+ projection of this stream — joining every token's :attr:`Token.value`
17
+ reproduces a body's text exactly, which is how
18
+ :func:`oaknut.basic.detokenise_body` is defined. :func:`scan` works on a
19
+ line *body* (inline tokens, no framing); :func:`scan_program` walks the
20
+ ``&0D``-framed stored form and yields a :class:`LineRecord` per line.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import enum
26
+ from collections.abc import Iterator
27
+ from dataclasses import dataclass
28
+
29
+ from oaknut.basic.exceptions import (
30
+ InvalidLineLengthError,
31
+ MissingLineMarkerError,
32
+ TruncatedProgramError,
33
+ )
34
+ from oaknut.basic.linenumber import decode_line_number
35
+ from oaknut.basic.tokens import HEADER_LENGTH, LINE_NUMBER_TOKEN, TOKEN_TO_KEYWORD
36
+
37
+ _CR = 0x0D
38
+ _END_MARKER = 0xFF
39
+ _QUOTE = 0x22
40
+
41
+
42
+ class TokenKind(enum.Enum):
43
+ """The kind of a scanned :class:`Token`."""
44
+
45
+ #: A keyword token byte, resolved to its spelling via the token table.
46
+ KEYWORD = "KEYWORD"
47
+ #: A run of literal bytes — identifiers, numbers, operators,
48
+ #: punctuation, spaces, and any unknown high byte — for the consumer
49
+ #: to sub-lex. Each byte ``b`` appears as ``chr(b)``.
50
+ TEXT = "TEXT"
51
+ #: A ``"..."`` string literal, copied verbatim through the closing
52
+ #: quote (or the end of the body, if unterminated).
53
+ STRING = "STRING"
54
+ #: A ``&8D`` three-byte line-number reference, decoded to decimal.
55
+ LINENUM = "LINENUM"
56
+
57
+
58
+ @dataclass(frozen=True)
59
+ class Token:
60
+ """One unit of a scanned BBC BASIC line body.
61
+
62
+ Attributes:
63
+ kind: Which :class:`TokenKind` this is.
64
+ value: The keyword spelling (``KEYWORD``), the literal run
65
+ (``TEXT``), the quoted string including its quotes
66
+ (``STRING``), or the decimal line number (``LINENUM``).
67
+ start: Byte offset of the token within the scanned data, shifted
68
+ by ``base_offset``.
69
+ token: The token byte, for a ``KEYWORD``; ``None`` otherwise.
70
+ """
71
+
72
+ kind: TokenKind
73
+ value: str
74
+ start: int
75
+ token: int | None = None
76
+
77
+
78
+ @dataclass(frozen=True)
79
+ class LineRecord:
80
+ """One line of a scanned stored program.
81
+
82
+ Attributes:
83
+ line_number: The line's number.
84
+ tokens: The line body as a tuple of :class:`Token`.
85
+ start: Byte offset of the line's ``&0D`` marker.
86
+ """
87
+
88
+ line_number: int
89
+ tokens: tuple[Token, ...]
90
+ start: int
91
+
92
+
93
+ def scan(data: bytes, *, base_offset: int = 0) -> Iterator[Token]:
94
+ """Scan an unframed line body into a stream of :class:`Token`.
95
+
96
+ Args:
97
+ data: A line *body* — inline token bytes with no ``&0D`` line
98
+ framing and no leading line number.
99
+ base_offset: Added to every token's :attr:`~Token.start`, so
100
+ offsets can be reported against a larger buffer (for example
101
+ the line body's position within a whole program).
102
+
103
+ Yields:
104
+ One :class:`Token` per keyword, literal run, string literal, and
105
+ ``&8D`` line-number reference, in source order.
106
+
107
+ Raises:
108
+ DetokeniseError: A ``&8D`` reference is truncated.
109
+ """
110
+ n = len(data)
111
+ i = 0
112
+ text_start = -1
113
+ text_chars: list[str] = []
114
+
115
+ def flush() -> Iterator[Token]:
116
+ nonlocal text_chars, text_start
117
+ if text_chars:
118
+ yield Token(TokenKind.TEXT, "".join(text_chars), base_offset + text_start)
119
+ text_chars = []
120
+ text_start = -1
121
+
122
+ while i < n:
123
+ b = data[i]
124
+ if b == _QUOTE:
125
+ yield from flush()
126
+ j = i + 1
127
+ while j < n and data[j] != _QUOTE:
128
+ j += 1
129
+ end = j + 1 if j < n else n
130
+ yield Token(TokenKind.STRING, "".join(chr(c) for c in data[i:end]), base_offset + i)
131
+ i = end
132
+ continue
133
+ if b == LINE_NUMBER_TOKEN:
134
+ yield from flush()
135
+ payload = data[i + 1 : i + 4]
136
+ if len(payload) != 3:
137
+ raise TruncatedProgramError(
138
+ offset=base_offset + i,
139
+ detail="incomplete &8D line-number reference",
140
+ )
141
+ yield Token(TokenKind.LINENUM, str(decode_line_number(payload)), base_offset + i)
142
+ i += 4
143
+ continue
144
+ if b >= 0x80 and b in TOKEN_TO_KEYWORD:
145
+ yield from flush()
146
+ yield Token(TokenKind.KEYWORD, TOKEN_TO_KEYWORD[b], base_offset + i, b)
147
+ i += 1
148
+ continue
149
+ # A literal byte (< &80) or an unknown high byte (e.g. the &CE
150
+ # gap): coalesce into a TEXT run so the byte still round-trips.
151
+ if text_start < 0:
152
+ text_start = i
153
+ text_chars.append(chr(b))
154
+ i += 1
155
+ yield from flush()
156
+
157
+
158
+ def scan_program(data: bytes) -> Iterator[LineRecord]:
159
+ """Walk a ``&0D``-framed stored program, scanning each line body.
160
+
161
+ Args:
162
+ data: A tokenised program as stored on disc or in memory,
163
+ terminated by the ``&0D &FF`` end marker.
164
+
165
+ Yields:
166
+ One :class:`LineRecord` per line, in storage order, each carrying
167
+ the line number and its body :class:`Token` stream.
168
+
169
+ Raises:
170
+ DetokeniseError: The framing is malformed — a missing line
171
+ marker, a truncated header or reference, or an impossible
172
+ length byte.
173
+ """
174
+ n = len(data)
175
+ i = 0
176
+ while i < n:
177
+ if data[i] != _CR:
178
+ raise MissingLineMarkerError(offset=i, found=data[i])
179
+ if i + 1 >= n:
180
+ raise TruncatedProgramError(offset=i, detail="line marker with no line number")
181
+ if data[i + 1] == _END_MARKER:
182
+ break
183
+ if i + HEADER_LENGTH > n:
184
+ raise TruncatedProgramError(offset=i, detail="incomplete line header")
185
+ line_number = (data[i + 1] << 8) | data[i + 2]
186
+ length = data[i + 3]
187
+ if length < HEADER_LENGTH or i + length > n:
188
+ raise InvalidLineLengthError(offset=i, length=length)
189
+ body = data[i + HEADER_LENGTH : i + length]
190
+ tokens = tuple(scan(body, base_offset=i + HEADER_LENGTH))
191
+ yield LineRecord(line_number=line_number, tokens=tokens, start=i)
192
+ i += length
@@ -25,6 +25,8 @@ Two structural facts drive the tables below:
25
25
 
26
26
  from __future__ import annotations
27
27
 
28
+ from types import MappingProxyType
29
+
28
30
  # The line-number reference token. Not a table entry: the tokeniser
29
31
  # produces ``&8D`` followed by a 3-byte encoded line number, and the
30
32
  # de-tokeniser special-cases it (see :mod:`oaknut.basic.linenumber`).
@@ -192,6 +194,9 @@ _ASSIGNMENT_FORMS: tuple[tuple[str, int], ...] = (
192
194
  )
193
195
 
194
196
  # token byte -> keyword spelling, covering every entry in the ROM table.
195
- # &8D (line number) and &CE (unused gap) are deliberately absent.
196
- TOKEN_TO_KEYWORD: dict[int, str] = {token: keyword for keyword, token, _flags in KEYWORDS}
197
- TOKEN_TO_KEYWORD.update({token: keyword for keyword, token in _ASSIGNMENT_FORMS})
197
+ # &8D (line number) and &CE (unused gap) are deliberately absent. Exposed
198
+ # read-only: it is public API a byte-level consumer reads but must not
199
+ # mutate (a stray write would corrupt the tokeniser and de-tokeniser).
200
+ _token_to_keyword: dict[int, str] = {token: keyword for keyword, token, _flags in KEYWORDS}
201
+ _token_to_keyword.update({token: keyword for keyword, token in _ASSIGNMENT_FORMS})
202
+ TOKEN_TO_KEYWORD: MappingProxyType[int, str] = MappingProxyType(_token_to_keyword)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: oaknut-basic
3
- Version: 12.8.2
3
+ Version: 12.9.0
4
4
  Summary: BBC BASIC tools: program tokeniser/de-tokeniser and PRINT#/INPUT# data-file reader/writer
5
5
  Author-email: Robert Smallshire <robert@smallshire.org.uk>
6
6
  License-Expression: MIT
@@ -9,6 +9,7 @@ src/oaknut/basic/exceptions.py
9
9
  src/oaknut/basic/float5.py
10
10
  src/oaknut/basic/linenumber.py
11
11
  src/oaknut/basic/numbering.py
12
+ src/oaknut/basic/scanner.py
12
13
  src/oaknut/basic/tokeniser.py
13
14
  src/oaknut/basic/tokens.py
14
15
  src/oaknut_basic.egg-info/PKG-INFO
@@ -29,5 +30,6 @@ tests/test_linenumber.py
29
30
  tests/test_numbering.py
30
31
  tests/test_rom_golden.py
31
32
  tests/test_rom_golden_detokenise.py
33
+ tests/test_scanner.py
32
34
  tests/test_tokeniser.py
33
35
  tests/test_tokens.py
@@ -176,15 +176,54 @@ class TestTokeniseCommand:
176
176
  assert "no line number" in result.output
177
177
 
178
178
 
179
+ class TestTokeniseEncoding:
180
+ def test_default_decodes_host_utf8_pound_sign(self):
181
+ # The default encoding is UTF-8, so a £ typed in a host editor
182
+ # (0xC2 0xA3) maps to the single Acorn code point, not the raw
183
+ # multibyte sequence.
184
+ runner = CliRunner()
185
+ result = runner.invoke(cli, ["tokenise"], input='10 PRINT "£"\n'.encode("utf-8"))
186
+ assert result.exit_code == 0
187
+ # Acorn stores £ as 0x60 inside the tokenised string literal.
188
+ assert b'"\x60"' in result.stdout_bytes
189
+
190
+ def test_acorn_encoding_passes_raw_bytes_through(self):
191
+ # --encoding acorn treats the input as Acorn bytes already, so a
192
+ # 0x60 byte is the £ literal verbatim.
193
+ runner = CliRunner()
194
+ result = runner.invoke(
195
+ cli, ["tokenise", "--encoding", "acorn"], input=b'10 PRINT "\x60"\n'
196
+ )
197
+ assert result.exit_code == 0
198
+ assert b'"\x60"' in result.stdout_bytes
199
+
200
+
179
201
  class TestDetokeniseCommand:
180
202
  def test_pipe_detokenises_to_stdout(self):
181
203
  program = basic.tokenise("10 PRINT")
182
204
  runner = CliRunner()
183
205
  result = runner.invoke(cli, ["detokenise"], input=program)
184
206
  assert result.exit_code == 0
185
- # Acorn output: CR line ending.
207
+ # Default UTF-8 output: host-native LF line ending.
208
+ assert result.stdout_bytes == b"10 PRINT\n"
209
+
210
+ def test_acorn_encoding_uses_cr_terminators(self):
211
+ program = basic.tokenise("10 PRINT")
212
+ runner = CliRunner()
213
+ result = runner.invoke(cli, ["detokenise", "--encoding", "acorn"], input=program)
214
+ assert result.exit_code == 0
215
+ # Acorn output: native CR line ending.
186
216
  assert result.stdout_bytes == b"10 PRINT\r"
187
217
 
218
+ def test_default_writes_host_utf8_pound_sign(self):
219
+ # A program whose string literal holds the Acorn £ (0x60)
220
+ # de-tokenises to a UTF-8 £ under the default encoding.
221
+ program = basic.tokenise('10 PRINT "£"')
222
+ runner = CliRunner()
223
+ result = runner.invoke(cli, ["detokenise"], input=program)
224
+ assert result.exit_code == 0
225
+ assert "£".encode("utf-8") in result.stdout_bytes
226
+
188
227
  def test_malformed_program_reports_the_offset(self):
189
228
  runner = CliRunner()
190
229
  result = runner.invoke(cli, ["detokenise"], input=b"\x99")
@@ -199,4 +238,5 @@ class TestTokeniseDetokeniseRoundTrip:
199
238
  assert tokenised.exit_code == 0
200
239
  listing = runner.invoke(cli, ["detokenise"], input=tokenised.stdout_bytes)
201
240
  assert listing.exit_code == 0
202
- assert listing.stdout_bytes == b"10 PRINT\r20 GOTO 10\r"
241
+ # Default UTF-8 output: host-native LF line endings.
242
+ assert listing.stdout_bytes == b"10 PRINT\n20 GOTO 10\n"
@@ -0,0 +1,179 @@
1
+ """Tests for the BBC BASIC II token-stream scanner.
2
+
3
+ The scanner is the faithful primitive: where the de-tokeniser flattens a
4
+ body to text (lossy for keyword adjacency), :func:`scan` yields a token
5
+ per keyword, literal run, string, and line-number reference, each with
6
+ its byte offset. The headline case is ``?Q <PLOT> ?Q``, whose text
7
+ ``?QPLOT?Q`` re-lexes as the variable ``QPLOT`` — only the stream keeps
8
+ ``PLOT`` as a distinct token.
9
+ """
10
+
11
+ import pytest
12
+ from oaknut.basic import (
13
+ LineRecord,
14
+ Token,
15
+ TokenKind,
16
+ detokenise_body,
17
+ scan,
18
+ scan_program,
19
+ )
20
+ from oaknut.basic.exceptions import (
21
+ MissingLineMarkerError,
22
+ TruncatedProgramError,
23
+ )
24
+ from oaknut.basic.linenumber import encode_line_number
25
+ from oaknut.basic.tokens import LINE_NUMBER_TOKEN
26
+
27
+
28
+ def _kinds(tokens):
29
+ return [token.kind for token in tokens]
30
+
31
+
32
+ def _values(tokens):
33
+ return [token.value for token in tokens]
34
+
35
+
36
+ class TestBodyScan:
37
+ def test_empty_body_yields_no_tokens(self):
38
+ assert list(scan(b"")) == []
39
+
40
+ def test_plain_text_is_one_run(self):
41
+ tokens = list(scan(b"A=1"))
42
+ assert tokens == [Token(TokenKind.TEXT, "A=1", 0)]
43
+
44
+ def test_keyword_resolves_to_spelling(self):
45
+ # &F1 is PRINT.
46
+ tokens = list(scan(b"\xf1"))
47
+ assert tokens == [Token(TokenKind.KEYWORD, "PRINT", 0, 0xF1)]
48
+
49
+ def test_keyword_splits_surrounding_text(self):
50
+ # The rheolism case: ?Q <PLOT=&F0> ?Q must keep PLOT distinct.
51
+ tokens = list(scan(b"?Q\xf0?Q"))
52
+ assert _kinds(tokens) == [TokenKind.TEXT, TokenKind.KEYWORD, TokenKind.TEXT]
53
+ assert _values(tokens) == ["?Q", "PLOT", "?Q"]
54
+ assert tokens[1].token == 0xF0
55
+ assert [token.start for token in tokens] == [0, 2, 3]
56
+
57
+ def test_string_is_copied_verbatim_through_the_quotes(self):
58
+ # Token-valued bytes inside a string must not expand.
59
+ tokens = list(scan(b'"\xf0"'))
60
+ assert tokens == [Token(TokenKind.STRING, '"\xf0"', 0)]
61
+
62
+ def test_unterminated_string_runs_to_end(self):
63
+ tokens = list(scan(b'X"ab'))
64
+ assert _kinds(tokens) == [TokenKind.TEXT, TokenKind.STRING]
65
+ assert _values(tokens) == ["X", '"ab']
66
+
67
+ def test_line_number_reference_is_decoded(self):
68
+ body = b"\xe5" + bytes((LINE_NUMBER_TOKEN,)) + encode_line_number(100)
69
+ tokens = list(scan(body))
70
+ assert _kinds(tokens) == [TokenKind.KEYWORD, TokenKind.LINENUM]
71
+ assert tokens[0].value == "GOTO"
72
+ assert tokens[1] == Token(TokenKind.LINENUM, "100", 1)
73
+
74
+ def test_unknown_high_byte_folds_into_text(self):
75
+ # &CE is the unused gap: no keyword, but must survive as a byte.
76
+ tokens = list(scan(b"A\xceB"))
77
+ assert tokens == [Token(TokenKind.TEXT, "A\xceB", 0)]
78
+
79
+ def test_base_offset_shifts_reported_positions(self):
80
+ tokens = list(scan(b"\xf0", base_offset=10))
81
+ assert tokens[0].start == 10
82
+
83
+ def test_truncated_line_number_reference_raises(self):
84
+ with pytest.raises(TruncatedProgramError):
85
+ list(scan(bytes((LINE_NUMBER_TOKEN, 0x40, 0x40))))
86
+
87
+
88
+ class TestDetokeniseBodyProjection:
89
+ """detokenise_body is exactly the text join of the token stream."""
90
+
91
+ @pytest.mark.parametrize(
92
+ "body",
93
+ [
94
+ b"",
95
+ b"A=1",
96
+ b"?Q\xf0?Q",
97
+ b'PRINT"\xf0\xf1"',
98
+ b"\xe5" + bytes((LINE_NUMBER_TOKEN,)) + encode_line_number(2000),
99
+ b"A\xceB",
100
+ b'X"ab',
101
+ ],
102
+ )
103
+ def test_body_text_is_the_joined_values(self, body):
104
+ assert detokenise_body(body) == "".join(token.value for token in scan(body))
105
+
106
+ def test_inline_token_body_detokenises_without_framing(self):
107
+ # The #39 case: a headerless inline-token body the framed
108
+ # detokenise() would reject at offset 0.
109
+ assert detokenise_body(b"\xeb\x37") == "MODE7"
110
+
111
+
112
+ class TestProgramScan:
113
+ def _program(self, *lines):
114
+ out = bytearray()
115
+ for line_number, body in lines:
116
+ out += bytes((0x0D, (line_number >> 8) & 0xFF, line_number & 0xFF, 4 + len(body)))
117
+ out += body
118
+ out += b"\x0d\xff"
119
+ return bytes(out)
120
+
121
+ def test_walks_records_and_yields_line_numbers(self):
122
+ program = self._program((10, b"\xeb\x37"), (20, b"?Q\xf0?Q"))
123
+ records = list(scan_program(program))
124
+ assert [record.line_number for record in records] == [10, 20]
125
+ assert all(isinstance(record, LineRecord) for record in records)
126
+ assert _values(records[1].tokens) == ["?Q", "PLOT", "?Q"]
127
+
128
+ def test_record_start_points_at_the_cr_marker(self):
129
+ program = self._program((10, b"X"), (20, b"Y"))
130
+ records = list(scan_program(program))
131
+ assert records[0].start == 0
132
+ assert records[1].start == 5
133
+
134
+ def test_empty_program_yields_no_records(self):
135
+ assert list(scan_program(b"\x0d\xff")) == []
136
+
137
+ def test_missing_line_marker_raises(self):
138
+ with pytest.raises(MissingLineMarkerError):
139
+ list(scan_program(b"\xeb"))
140
+
141
+
142
+ class TestPublicTokenTable:
143
+ """The byte-level surface a faithful front end consumes (#41)."""
144
+
145
+ def test_table_symbols_are_exported_from_the_package(self):
146
+ import oaknut.basic as basic
147
+
148
+ for name in (
149
+ "TOKEN_TO_KEYWORD",
150
+ "KEYWORDS",
151
+ "LINE_NUMBER_TOKEN",
152
+ "decode_line_number",
153
+ "FLAG_CONDITIONAL",
154
+ "FLAG_MIDDLE",
155
+ "FLAG_START",
156
+ "FLAG_FN_PROC",
157
+ "FLAG_LINE_NUMBER",
158
+ "FLAG_STOP_LINE",
159
+ "FLAG_PSEUDO_VAR",
160
+ ):
161
+ assert name in basic.__all__
162
+ assert hasattr(basic, name)
163
+
164
+ def test_token_to_keyword_resolves_a_keyword(self):
165
+ from oaknut.basic import TOKEN_TO_KEYWORD
166
+
167
+ assert TOKEN_TO_KEYWORD[0xF1] == "PRINT"
168
+
169
+ def test_token_to_keyword_is_read_only(self):
170
+ from oaknut.basic import TOKEN_TO_KEYWORD
171
+
172
+ with pytest.raises(TypeError):
173
+ TOKEN_TO_KEYWORD[0x00] = "OOPS"
174
+
175
+ def test_line_number_helpers_round_trip(self):
176
+ from oaknut.basic import LINE_NUMBER_TOKEN, decode_line_number
177
+
178
+ assert LINE_NUMBER_TOKEN == 0x8D
179
+ assert decode_line_number(encode_line_number(1234)) == 1234
@@ -1,116 +0,0 @@
1
- """De-tokenise a stored BBC BASIC II program into source text.
2
-
3
- Mirrors the ROM's ``LIST`` walk: split the program into lines on the
4
- ``&0D`` start-of-line markers, render each line's number followed by its
5
- de-tokenised body. Within a body, keyword tokens expand to their table
6
- spelling, ``&8D`` references decode back to plain decimal, and bytes
7
- inside a quoted string are emitted verbatim (no token expansion).
8
-
9
- The returned string uses latin-1/code-point semantics: a stored byte
10
- ``b`` becomes ``chr(b)``, so the text round-trips byte-exactly through
11
- :func:`oaknut.basic.tokenise`. Mapping to a host character set (Acorn,
12
- UTF-8) is the caller's job — the CLI does it at its I/O boundary.
13
-
14
- The line layout (from the disassembly) is::
15
-
16
- &0D <lineNo-hi> <lineNo-lo> <length> <body...> ... &0D &FF
17
-
18
- where *length* counts the whole record from this ``&0D`` up to (not
19
- including) the next one, so ``length = 4 + len(body)``.
20
- """
21
-
22
- from __future__ import annotations
23
-
24
- from oaknut.basic.exceptions import (
25
- InvalidLineLengthError,
26
- MissingLineMarkerError,
27
- TruncatedProgramError,
28
- )
29
- from oaknut.basic.linenumber import decode_line_number
30
- from oaknut.basic.tokens import HEADER_LENGTH, LINE_NUMBER_TOKEN, TOKEN_TO_KEYWORD
31
-
32
- _CR = 0x0D
33
- _END_MARKER = 0xFF
34
- _QUOTE = 0x22
35
-
36
-
37
- def detokenise(data: bytes) -> str:
38
- """De-tokenise a BBC BASIC II program into source text.
39
-
40
- Args:
41
- data: A tokenised program, as stored on disc or in memory,
42
- terminated by the ``&0D &FF`` end marker.
43
-
44
- Returns:
45
- The program as numbered source text, lines separated by ``"\\n"``
46
- with a trailing newline. Empty for an empty program.
47
-
48
- Raises:
49
- DetokeniseError: The token stream is malformed — a missing line
50
- marker, a truncated header or reference, or an impossible
51
- length byte.
52
- """
53
- lines: list[str] = []
54
- i = 0
55
- n = len(data)
56
- while i < n:
57
- if data[i] != _CR:
58
- raise MissingLineMarkerError(offset=i, found=data[i])
59
- if i + 1 >= n:
60
- raise TruncatedProgramError(offset=i, detail="line marker with no line number")
61
- if data[i + 1] == _END_MARKER:
62
- break
63
- if i + HEADER_LENGTH > n:
64
- raise TruncatedProgramError(offset=i, detail="incomplete line header")
65
- line_number = (data[i + 1] << 8) | data[i + 2]
66
- length = data[i + 3]
67
- if length < HEADER_LENGTH or i + length > n:
68
- raise InvalidLineLengthError(offset=i, length=length)
69
- body = data[i + HEADER_LENGTH : i + length]
70
- lines.append(f"{line_number}{_detokenise_body(body, base_offset=i + HEADER_LENGTH)}")
71
- i += length
72
- return "".join(f"{line}\n" for line in lines)
73
-
74
-
75
- def _detokenise_body(body: bytes, *, base_offset: int) -> str:
76
- """Expand one line's body bytes to text.
77
-
78
- *base_offset* is the absolute offset of ``body[0]`` within the whole
79
- program, so any fault can be reported at its true location.
80
- """
81
- out: list[str] = []
82
- i = 0
83
- n = len(body)
84
- in_string = False
85
- while i < n:
86
- b = body[i]
87
- if in_string:
88
- out.append(chr(b))
89
- if b == _QUOTE:
90
- in_string = False
91
- i += 1
92
- continue
93
- if b == _QUOTE:
94
- in_string = True
95
- out.append('"')
96
- i += 1
97
- continue
98
- if b == LINE_NUMBER_TOKEN:
99
- payload = body[i + 1 : i + 4]
100
- if len(payload) != 3:
101
- raise TruncatedProgramError(
102
- offset=base_offset + i,
103
- detail="incomplete &8D line-number reference",
104
- )
105
- out.append(str(decode_line_number(payload)))
106
- i += 4
107
- continue
108
- if b >= 0x80:
109
- # Unknown high bytes (e.g. the &CE gap) fall back to a raw
110
- # character so the bytes still round-trip.
111
- out.append(TOKEN_TO_KEYWORD.get(b, chr(b)))
112
- i += 1
113
- continue
114
- out.append(chr(b))
115
- i += 1
116
- return "".join(out)
File without changes
File without changes
File without changes