generaltranslation-icu-messageformat-parser 0.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- generaltranslation_icu_messageformat_parser-0.0.0/LICENSE.md +123 -0
- generaltranslation_icu_messageformat_parser-0.0.0/PKG-INFO +155 -0
- generaltranslation_icu_messageformat_parser-0.0.0/README.md +144 -0
- generaltranslation_icu_messageformat_parser-0.0.0/pyproject.toml +16 -0
- generaltranslation_icu_messageformat_parser-0.0.0/src/generaltranslation_icu_messageformat_parser/__init__.py +13 -0
- generaltranslation_icu_messageformat_parser-0.0.0/src/generaltranslation_icu_messageformat_parser/_constants.py +38 -0
- generaltranslation_icu_messageformat_parser-0.0.0/src/generaltranslation_icu_messageformat_parser/_parser.py +686 -0
- generaltranslation_icu_messageformat_parser-0.0.0/src/generaltranslation_icu_messageformat_parser/_printer.py +168 -0
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# Functional Source License, Version 1.1, ALv2 Future License
|
|
2
|
+
|
|
3
|
+
## Abbreviation
|
|
4
|
+
|
|
5
|
+
FSL-1.1-ALv2
|
|
6
|
+
|
|
7
|
+
## Notice
|
|
8
|
+
|
|
9
|
+
Copyright 2026 General Translation, Inc.
|
|
10
|
+
|
|
11
|
+
## Terms and Conditions
|
|
12
|
+
|
|
13
|
+
### Licensor ("We")
|
|
14
|
+
|
|
15
|
+
The party offering the Software under these Terms and Conditions.
|
|
16
|
+
|
|
17
|
+
### The Software
|
|
18
|
+
|
|
19
|
+
The "Software" is each version of the software that we make available under
|
|
20
|
+
these Terms and Conditions, as indicated by our inclusion of these Terms and
|
|
21
|
+
Conditions with the Software.
|
|
22
|
+
|
|
23
|
+
### License Grant
|
|
24
|
+
|
|
25
|
+
Subject to your compliance with this License Grant and the Patents,
|
|
26
|
+
Redistribution and Trademark clauses below, we hereby grant you the right to
|
|
27
|
+
use, copy, modify, create derivative works, publicly perform, publicly display
|
|
28
|
+
and redistribute the Software for any Permitted Purpose identified below.
|
|
29
|
+
|
|
30
|
+
### Permitted Purpose
|
|
31
|
+
|
|
32
|
+
A Permitted Purpose is any purpose other than a Competing Use. A Competing Use
|
|
33
|
+
means making the Software available to others in a commercial product or
|
|
34
|
+
service that:
|
|
35
|
+
|
|
36
|
+
1. substitutes for the Software;
|
|
37
|
+
|
|
38
|
+
2. substitutes for any other product or service we offer using the Software
|
|
39
|
+
that exists as of the date we make the Software available; or
|
|
40
|
+
|
|
41
|
+
3. offers the same or substantially similar functionality as the Software.
|
|
42
|
+
|
|
43
|
+
Permitted Purposes specifically include using the Software:
|
|
44
|
+
|
|
45
|
+
1. for your internal use and access;
|
|
46
|
+
|
|
47
|
+
2. for non-commercial education;
|
|
48
|
+
|
|
49
|
+
3. for non-commercial research; and
|
|
50
|
+
|
|
51
|
+
4. in connection with professional services that you provide to a licensee
|
|
52
|
+
using the Software in accordance with these Terms and Conditions.
|
|
53
|
+
|
|
54
|
+
### Patents
|
|
55
|
+
|
|
56
|
+
To the extent your use for a Permitted Purpose would necessarily infringe our
|
|
57
|
+
patents, the license grant above includes a license under our patents. If you
|
|
58
|
+
make a claim against any party that the Software infringes or contributes to
|
|
59
|
+
the infringement of any patent, then your patent license to the Software ends
|
|
60
|
+
immediately.
|
|
61
|
+
|
|
62
|
+
### Redistribution
|
|
63
|
+
|
|
64
|
+
The Terms and Conditions apply to all copies, modifications and derivatives of
|
|
65
|
+
the Software.
|
|
66
|
+
|
|
67
|
+
If you redistribute any copies, modifications or derivatives of the Software,
|
|
68
|
+
you must include a copy of or a link to these Terms and Conditions and not
|
|
69
|
+
remove any copyright notices provided in or with the Software.
|
|
70
|
+
|
|
71
|
+
### Disclaimer
|
|
72
|
+
|
|
73
|
+
THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTIES OF ANY KIND, EXPRESS OR
|
|
74
|
+
IMPLIED, INCLUDING WITHOUT LIMITATION WARRANTIES OF FITNESS FOR A PARTICULAR
|
|
75
|
+
PURPOSE, MERCHANTABILITY, TITLE OR NON-INFRINGEMENT.
|
|
76
|
+
|
|
77
|
+
IN NO EVENT WILL WE HAVE ANY LIABILITY TO YOU ARISING OUT OF OR RELATED TO THE
|
|
78
|
+
SOFTWARE, INCLUDING INDIRECT, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES,
|
|
79
|
+
EVEN IF WE HAVE BEEN INFORMED OF THEIR POSSIBILITY IN ADVANCE.
|
|
80
|
+
|
|
81
|
+
### Trademarks
|
|
82
|
+
|
|
83
|
+
Except for displaying the License Details and identifying us as the origin of
|
|
84
|
+
the Software, you have no right under these Terms and Conditions to use our
|
|
85
|
+
trademarks, trade names, service marks or product names.
|
|
86
|
+
|
|
87
|
+
## Grant of Future License
|
|
88
|
+
|
|
89
|
+
We hereby irrevocably grant you an additional license to use the Software under
|
|
90
|
+
the Apache License, Version 2.0 that is effective on the second anniversary of
|
|
91
|
+
the date we make the Software available. On or after that date, you may use the
|
|
92
|
+
Software under the Apache License, Version 2.0, in which case the following
|
|
93
|
+
will apply:
|
|
94
|
+
|
|
95
|
+
Licensed under the Apache License, Version 2.0 (the "License"); you may not use
|
|
96
|
+
this file except in compliance with the License.
|
|
97
|
+
|
|
98
|
+
You may obtain a copy of the License at
|
|
99
|
+
|
|
100
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
101
|
+
|
|
102
|
+
Unless required by applicable law or agreed to in writing, software distributed
|
|
103
|
+
under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
|
|
104
|
+
CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
|
105
|
+
specific language governing permissions and limitations under the License.
|
|
106
|
+
|
|
107
|
+
---
|
|
108
|
+
|
|
109
|
+
## Third-Party Attribution
|
|
110
|
+
|
|
111
|
+
This software includes code derived from
|
|
112
|
+
[pyicumessageformat](https://github.com/SirStendec/pyicumessageformat),
|
|
113
|
+
Copyright (c) 2021 Mike deBeaubien, licensed under the MIT License:
|
|
114
|
+
|
|
115
|
+
> Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
116
|
+
> of this software and associated documentation files (the "Software"), to deal
|
|
117
|
+
> in the Software without restriction, including without limitation the rights
|
|
118
|
+
> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
119
|
+
> copies of the Software, and to permit persons to whom the Software is
|
|
120
|
+
> furnished to do so, subject to the following conditions:
|
|
121
|
+
>
|
|
122
|
+
> The above copyright notice and this permission notice shall be included in all
|
|
123
|
+
> copies or substantial portions of the Software.
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: generaltranslation-icu-messageformat-parser
|
|
3
|
+
Version: 0.0.0
|
|
4
|
+
Summary: ICU MessageFormat parser with whitespace-preserving AST and string reconstruction
|
|
5
|
+
Author: General Translation, Inc.
|
|
6
|
+
Author-email: General Translation, Inc. <support@generaltranslation.com>
|
|
7
|
+
License-Expression: FSL-1.1-ALv2
|
|
8
|
+
License-File: LICENSE.md
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
# generaltranslation-icu-messageformat-parser
|
|
13
|
+
|
|
14
|
+
A pure-Python ICU MessageFormat parser with whitespace-preserving AST and string reconstruction. Python equivalent of [`@formatjs/icu-messageformat-parser`](https://www.npmjs.com/package/@formatjs/icu-messageformat-parser).
|
|
15
|
+
|
|
16
|
+
Derived from [pyicumessageformat](https://github.com/SirStendec/pyicumessageformat) by Mike deBeaubien (MIT license).
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install generaltranslation-icu-messageformat-parser
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
No dependencies. Pure Python. Requires Python 3.10+.
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
from generaltranslation_icu_messageformat_parser import Parser, print_ast
|
|
30
|
+
|
|
31
|
+
parser = Parser()
|
|
32
|
+
ast = parser.parse("{count, plural, one {# item} other {# items}}")
|
|
33
|
+
# [{'name': 'count', 'type': 'plural', 'offset': 0, 'options': {'one': [{'type': 'number', 'name': 'count', 'hash': True}, ' item'], 'other': [{'type': 'number', 'name': 'count', 'hash': True}, ' items']}}]
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## API
|
|
37
|
+
|
|
38
|
+
### `Parser(options=None)`
|
|
39
|
+
|
|
40
|
+
Create a parser instance with optional configuration.
|
|
41
|
+
|
|
42
|
+
**Options dict keys:**
|
|
43
|
+
|
|
44
|
+
| Option | Type | Default | Description |
|
|
45
|
+
|---|---|---|---|
|
|
46
|
+
| `subnumeric_types` | `list[str]` | `['plural', 'selectordinal']` | Types that support `#` hash replacement |
|
|
47
|
+
| `submessage_types` | `list[str]` | `['plural', 'selectordinal', 'select']` | Types with sub-message branches |
|
|
48
|
+
| `maximum_depth` | `int` | `50` | Maximum nesting depth |
|
|
49
|
+
| `allow_tags` | `bool` | `False` | Enable XML-style `<tag>` parsing |
|
|
50
|
+
| `strict_tags` | `bool` | `False` | Strict tag parsing mode |
|
|
51
|
+
| `tag_prefix` | `str \| None` | `None` | Required tag name prefix |
|
|
52
|
+
| `tag_type` | `str` | `'tag'` | AST node type string for tags |
|
|
53
|
+
| `include_indices` | `bool` | `False` | Include `start`/`end` positions in AST nodes |
|
|
54
|
+
| `loose_submessages` | `bool` | `False` | Allow loose submessage parsing |
|
|
55
|
+
| `allow_format_spaces` | `bool` | `True` | Allow spaces in format strings |
|
|
56
|
+
| `require_other` | `bool` | `True` | Require `other` branch in plural/select |
|
|
57
|
+
| `preserve_whitespace` | `bool` | `False` | Store whitespace in `_ws` dict on AST nodes for lossless round-trips |
|
|
58
|
+
|
|
59
|
+
### `Parser.parse(input, tokens=None)`
|
|
60
|
+
|
|
61
|
+
Parse an ICU MessageFormat string into an AST.
|
|
62
|
+
|
|
63
|
+
**Args:**
|
|
64
|
+
- `input` (`str`): The ICU MessageFormat string to parse.
|
|
65
|
+
- `tokens` (`list | None`): Optional list to populate with token objects for low-level analysis.
|
|
66
|
+
|
|
67
|
+
**Returns:** `list` — A list of AST nodes (strings and dicts).
|
|
68
|
+
|
|
69
|
+
**Raises:** `SyntaxError` on malformed input, `TypeError` if input is not a string.
|
|
70
|
+
|
|
71
|
+
### `print_ast(ast)`
|
|
72
|
+
|
|
73
|
+
Reconstruct an ICU MessageFormat string from an AST.
|
|
74
|
+
|
|
75
|
+
**Args:**
|
|
76
|
+
- `ast` (`list`): The AST as returned by `Parser.parse()`.
|
|
77
|
+
|
|
78
|
+
**Returns:** `str` — The reconstructed ICU MessageFormat string.
|
|
79
|
+
|
|
80
|
+
When the AST contains `_ws` whitespace metadata (from `preserve_whitespace=True`), reconstruction is lossless — the output exactly matches the original input. Without whitespace metadata, normalized spacing is used.
|
|
81
|
+
|
|
82
|
+
## AST Node Types
|
|
83
|
+
|
|
84
|
+
### String literal
|
|
85
|
+
Plain strings appear directly in the AST list:
|
|
86
|
+
```python
|
|
87
|
+
parser.parse("Hello world")
|
|
88
|
+
# ["Hello world"]
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Simple variable `{name}`
|
|
92
|
+
```python
|
|
93
|
+
{"name": "username"}
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Typed placeholder `{name, type, style}`
|
|
97
|
+
```python
|
|
98
|
+
{"name": "amount", "type": "number", "format": "::currency/USD"}
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Plural / selectordinal `{n, plural, ...}`
|
|
102
|
+
```python
|
|
103
|
+
{
|
|
104
|
+
"name": "count",
|
|
105
|
+
"type": "plural", # or "selectordinal"
|
|
106
|
+
"offset": 0, # offset value (0 if none)
|
|
107
|
+
"options": {
|
|
108
|
+
"one": [{"type": "number", "name": "count", "hash": True}, " item"],
|
|
109
|
+
"other": [{"type": "number", "name": "count", "hash": True}, " items"],
|
|
110
|
+
"=0": ["no items"], # exact match keys
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### Select `{gender, select, ...}`
|
|
116
|
+
```python
|
|
117
|
+
{
|
|
118
|
+
"name": "gender",
|
|
119
|
+
"type": "select",
|
|
120
|
+
"options": {
|
|
121
|
+
"male": ["He"],
|
|
122
|
+
"female": ["She"],
|
|
123
|
+
"other": ["They"],
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Hash `#` (inside plural/selectordinal)
|
|
129
|
+
```python
|
|
130
|
+
{"type": "number", "name": "count", "hash": True}
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### With `include_indices=True`
|
|
134
|
+
All dict nodes gain `start` and `end` integer fields giving character offsets (Python `str` indices, not byte offsets) into the original string.
|
|
135
|
+
|
|
136
|
+
### With `preserve_whitespace=True`
|
|
137
|
+
Dict nodes gain a `_ws` dict storing whitespace at each structural position, enabling lossless `print_ast()` round-trips.
|
|
138
|
+
|
|
139
|
+
## Supported ICU Features
|
|
140
|
+
|
|
141
|
+
- Simple variable interpolation: `{name}`
|
|
142
|
+
- Plural with CLDR categories: `{n, plural, one {...} other {...}}`
|
|
143
|
+
- Exact match: `{n, plural, =0 {...} =1 {...} other {...}}`
|
|
144
|
+
- Plural offset: `{n, plural, offset:1 ...}`
|
|
145
|
+
- Selectordinal: `{n, selectordinal, one {#st} two {#nd} few {#rd} other {#th}}`
|
|
146
|
+
- Select: `{gender, select, male {...} female {...} other {...}}`
|
|
147
|
+
- Nested expressions: plural inside select, select inside plural, etc.
|
|
148
|
+
- Typed placeholders: `{amount, number}`, `{d, date, short}`
|
|
149
|
+
- ICU escape sequences: `''` for literal quote, `'{...}'` for literal braces
|
|
150
|
+
- Hash `#` replacement inside plural/selectordinal branches
|
|
151
|
+
- XML-style tags (opt-in): `<bold>text</bold>`
|
|
152
|
+
|
|
153
|
+
## Known Limitations
|
|
154
|
+
|
|
155
|
+
- **Escape sequences are consumed during parsing.** `''` becomes `'` and `'{...}'` becomes `{...}` in the AST. These cannot be reconstructed by `print_ast()`. This matches the behavior of `@formatjs/icu-messageformat-parser`.
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
# generaltranslation-icu-messageformat-parser
|
|
2
|
+
|
|
3
|
+
A pure-Python ICU MessageFormat parser with whitespace-preserving AST and string reconstruction. Python equivalent of [`@formatjs/icu-messageformat-parser`](https://www.npmjs.com/package/@formatjs/icu-messageformat-parser).
|
|
4
|
+
|
|
5
|
+
Derived from [pyicumessageformat](https://github.com/SirStendec/pyicumessageformat) by Mike deBeaubien (MIT license).
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install generaltranslation-icu-messageformat-parser
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
No dependencies. Pure Python. Requires Python 3.10+.
|
|
14
|
+
|
|
15
|
+
## Quick Start
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
from generaltranslation_icu_messageformat_parser import Parser, print_ast
|
|
19
|
+
|
|
20
|
+
parser = Parser()
|
|
21
|
+
ast = parser.parse("{count, plural, one {# item} other {# items}}")
|
|
22
|
+
# [{'name': 'count', 'type': 'plural', 'offset': 0, 'options': {'one': [{'type': 'number', 'name': 'count', 'hash': True}, ' item'], 'other': [{'type': 'number', 'name': 'count', 'hash': True}, ' items']}}]
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## API
|
|
26
|
+
|
|
27
|
+
### `Parser(options=None)`
|
|
28
|
+
|
|
29
|
+
Create a parser instance with optional configuration.
|
|
30
|
+
|
|
31
|
+
**Options dict keys:**
|
|
32
|
+
|
|
33
|
+
| Option | Type | Default | Description |
|
|
34
|
+
|---|---|---|---|
|
|
35
|
+
| `subnumeric_types` | `list[str]` | `['plural', 'selectordinal']` | Types that support `#` hash replacement |
|
|
36
|
+
| `submessage_types` | `list[str]` | `['plural', 'selectordinal', 'select']` | Types with sub-message branches |
|
|
37
|
+
| `maximum_depth` | `int` | `50` | Maximum nesting depth |
|
|
38
|
+
| `allow_tags` | `bool` | `False` | Enable XML-style `<tag>` parsing |
|
|
39
|
+
| `strict_tags` | `bool` | `False` | Strict tag parsing mode |
|
|
40
|
+
| `tag_prefix` | `str \| None` | `None` | Required tag name prefix |
|
|
41
|
+
| `tag_type` | `str` | `'tag'` | AST node type string for tags |
|
|
42
|
+
| `include_indices` | `bool` | `False` | Include `start`/`end` positions in AST nodes |
|
|
43
|
+
| `loose_submessages` | `bool` | `False` | Allow loose submessage parsing |
|
|
44
|
+
| `allow_format_spaces` | `bool` | `True` | Allow spaces in format strings |
|
|
45
|
+
| `require_other` | `bool` | `True` | Require `other` branch in plural/select |
|
|
46
|
+
| `preserve_whitespace` | `bool` | `False` | Store whitespace in `_ws` dict on AST nodes for lossless round-trips |
|
|
47
|
+
|
|
48
|
+
### `Parser.parse(input, tokens=None)`
|
|
49
|
+
|
|
50
|
+
Parse an ICU MessageFormat string into an AST.
|
|
51
|
+
|
|
52
|
+
**Args:**
|
|
53
|
+
- `input` (`str`): The ICU MessageFormat string to parse.
|
|
54
|
+
- `tokens` (`list | None`): Optional list to populate with token objects for low-level analysis.
|
|
55
|
+
|
|
56
|
+
**Returns:** `list` — A list of AST nodes (strings and dicts).
|
|
57
|
+
|
|
58
|
+
**Raises:** `SyntaxError` on malformed input, `TypeError` if input is not a string.
|
|
59
|
+
|
|
60
|
+
### `print_ast(ast)`
|
|
61
|
+
|
|
62
|
+
Reconstruct an ICU MessageFormat string from an AST.
|
|
63
|
+
|
|
64
|
+
**Args:**
|
|
65
|
+
- `ast` (`list`): The AST as returned by `Parser.parse()`.
|
|
66
|
+
|
|
67
|
+
**Returns:** `str` — The reconstructed ICU MessageFormat string.
|
|
68
|
+
|
|
69
|
+
When the AST contains `_ws` whitespace metadata (from `preserve_whitespace=True`), reconstruction is lossless — the output exactly matches the original input. Without whitespace metadata, normalized spacing is used.
|
|
70
|
+
|
|
71
|
+
## AST Node Types
|
|
72
|
+
|
|
73
|
+
### String literal
|
|
74
|
+
Plain strings appear directly in the AST list:
|
|
75
|
+
```python
|
|
76
|
+
parser.parse("Hello world")
|
|
77
|
+
# ["Hello world"]
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Simple variable `{name}`
|
|
81
|
+
```python
|
|
82
|
+
{"name": "username"}
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Typed placeholder `{name, type, style}`
|
|
86
|
+
```python
|
|
87
|
+
{"name": "amount", "type": "number", "format": "::currency/USD"}
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Plural / selectordinal `{n, plural, ...}`
|
|
91
|
+
```python
|
|
92
|
+
{
|
|
93
|
+
"name": "count",
|
|
94
|
+
"type": "plural", # or "selectordinal"
|
|
95
|
+
"offset": 0, # offset value (0 if none)
|
|
96
|
+
"options": {
|
|
97
|
+
"one": [{"type": "number", "name": "count", "hash": True}, " item"],
|
|
98
|
+
"other": [{"type": "number", "name": "count", "hash": True}, " items"],
|
|
99
|
+
"=0": ["no items"], # exact match keys
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Select `{gender, select, ...}`
|
|
105
|
+
```python
|
|
106
|
+
{
|
|
107
|
+
"name": "gender",
|
|
108
|
+
"type": "select",
|
|
109
|
+
"options": {
|
|
110
|
+
"male": ["He"],
|
|
111
|
+
"female": ["She"],
|
|
112
|
+
"other": ["They"],
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### Hash `#` (inside plural/selectordinal)
|
|
118
|
+
```python
|
|
119
|
+
{"type": "number", "name": "count", "hash": True}
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### With `include_indices=True`
|
|
123
|
+
All dict nodes gain `start` and `end` integer fields giving character offsets (Python `str` indices, not byte offsets) into the original string.
|
|
124
|
+
|
|
125
|
+
### With `preserve_whitespace=True`
|
|
126
|
+
Dict nodes gain a `_ws` dict storing whitespace at each structural position, enabling lossless `print_ast()` round-trips.
|
|
127
|
+
|
|
128
|
+
## Supported ICU Features
|
|
129
|
+
|
|
130
|
+
- Simple variable interpolation: `{name}`
|
|
131
|
+
- Plural with CLDR categories: `{n, plural, one {...} other {...}}`
|
|
132
|
+
- Exact match: `{n, plural, =0 {...} =1 {...} other {...}}`
|
|
133
|
+
- Plural offset: `{n, plural, offset:1 ...}`
|
|
134
|
+
- Selectordinal: `{n, selectordinal, one {#st} two {#nd} few {#rd} other {#th}}`
|
|
135
|
+
- Select: `{gender, select, male {...} female {...} other {...}}`
|
|
136
|
+
- Nested expressions: plural inside select, select inside plural, etc.
|
|
137
|
+
- Typed placeholders: `{amount, number}`, `{d, date, short}`
|
|
138
|
+
- ICU escape sequences: `''` for literal quote, `'{...}'` for literal braces
|
|
139
|
+
- Hash `#` replacement inside plural/selectordinal branches
|
|
140
|
+
- XML-style tags (opt-in): `<bold>text</bold>`
|
|
141
|
+
|
|
142
|
+
## Known Limitations
|
|
143
|
+
|
|
144
|
+
- **Escape sequences are consumed during parsing.** `''` becomes `'` and `'{...}'` becomes `{...}` in the AST. These cannot be reconstructed by `print_ast()`. This matches the behavior of `@formatjs/icu-messageformat-parser`.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "generaltranslation-icu-messageformat-parser"
|
|
3
|
+
version = "0.0.0"
|
|
4
|
+
description = "ICU MessageFormat parser with whitespace-preserving AST and string reconstruction"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "General Translation, Inc.", email = "support@generaltranslation.com" }
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.10"
|
|
10
|
+
license = "FSL-1.1-ALv2"
|
|
11
|
+
license-files = ["LICENSE.md"]
|
|
12
|
+
dependencies = []
|
|
13
|
+
|
|
14
|
+
[build-system]
|
|
15
|
+
requires = ["uv_build>=0.10.8,<0.11.0"]
|
|
16
|
+
build-backend = "uv_build"
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""ICU MessageFormat parser with whitespace-preserving AST and string reconstruction.
|
|
2
|
+
|
|
3
|
+
A Python equivalent of ``@formatjs/icu-messageformat-parser``. Parses ICU
|
|
4
|
+
MessageFormat strings into ASTs and reconstructs strings from ASTs.
|
|
5
|
+
|
|
6
|
+
Derived from `pyicumessageformat <https://github.com/SirStendec/pyicumessageformat>`_
|
|
7
|
+
by Mike deBeaubien (MIT license).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from generaltranslation_icu_messageformat_parser._parser import Parser
|
|
11
|
+
from generaltranslation_icu_messageformat_parser._printer import print_ast
|
|
12
|
+
|
|
13
|
+
__all__ = ["Parser", "print_ast"]
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""ICU MessageFormat parser constants.
|
|
2
|
+
|
|
3
|
+
Derived from pyicumessageformat by Mike deBeaubien (MIT).
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
# Braces that open and close a placeholder or sub-message.
CHAR_OPEN = "{"
CHAR_CLOSE = "}"
# The three characters that XML-style tags are built from.
CHAR_TAG_OPEN = "<"
CHAR_TAG_CLOSE = "/"
CHAR_TAG_END = ">"
# Separator between the parts of a placeholder, e.g. "{name, type, style}".
CHAR_SEP = ","
# Treated as a special character by the parser's text scanner when the parent
# node's type is listed in the ``subnumeric_types`` option.
CHAR_HASH = "#"
# Apostrophe: the escape/quote character handled by the parser's text scanner.
CHAR_ESCAPE = "'"

# Literal keyword for the plural offset clause ("offset:1"); the code that
# consumes it is outside this view.
OFFSET = "offset:"

# Characters that always terminate a run of literal text in the parser.
VAR_CHARS = [CHAR_OPEN, CHAR_CLOSE]

# All tag punctuation characters (usage not visible here).
TAG_CHARS = [CHAR_TAG_OPEN, CHAR_TAG_CLOSE, CHAR_TAG_END]

# "/>" -- presumably the terminator of a self-closing tag; confirm against the
# tag parser.
TAG_CLOSING = CHAR_TAG_CLOSE + CHAR_TAG_END
# "</" -- the parser checks for this sequence to detect the start of a closing
# tag.
TAG_END = CHAR_TAG_OPEN + CHAR_TAG_CLOSE

# Individual code points (compared against ``ord(char)``) that count as
# whitespace. The parser's ``_is_space`` additionally accepts the ranges
# U+0009..U+000D and U+2000..U+200D.
SPACE_CHARS = [
    0x20,  # SPACE
    0x85,  # NEXT LINE
    0xA0,  # NO-BREAK SPACE
    0x180E,  # MONGOLIAN VOWEL SEPARATOR
    0x2028,  # LINE SEPARATOR
    0x2029,  # PARAGRAPH SEPARATOR
    0x202F,  # NARROW NO-BREAK SPACE
    0x205F,  # MEDIUM MATHEMATICAL SPACE
    0x2060,  # WORD JOINER
    0x3000,  # IDEOGRAPHIC SPACE
    0xFEFF,  # ZERO WIDTH NO-BREAK SPACE (BOM)
]

# NOTE(review): usage is not visible here; looks like a unique sentinel object
# (compared by identity) marking a closing tag -- confirm against the parser.
CLOSE_TAG = {}
|
|
@@ -0,0 +1,686 @@
|
|
|
1
|
+
"""ICU MessageFormat parser.
|
|
2
|
+
|
|
3
|
+
Derived from pyicumessageformat by Mike deBeaubien (MIT).
|
|
4
|
+
Enhanced with whitespace-preserving AST support.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from . import _constants as constants
|
|
10
|
+
|
|
11
|
+
# Evaluates to ", or }". Its use is outside this view -- presumably it is
# interpolated into "Expected ..." error messages (confirm against callers).
SEP_OR_CLOSE = f"{constants.CHAR_SEP} or {constants.CHAR_CLOSE}"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _append_token(context: dict, type: str, text: str) -> None:
|
|
15
|
+
if "tokens" in context:
|
|
16
|
+
context["tokens"].append({"type": type, "text": text})
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _is_alpha(char: str) -> bool:
|
|
20
|
+
if not char:
|
|
21
|
+
return False
|
|
22
|
+
code = ord(char)
|
|
23
|
+
return (97 <= code <= 122) or (65 <= code <= 90)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _is_digit(char: str) -> bool:
|
|
27
|
+
if not char:
|
|
28
|
+
return False
|
|
29
|
+
code = ord(char)
|
|
30
|
+
return 0x30 <= code <= 0x39
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _is_space(char: str) -> bool:
|
|
34
|
+
if not char:
|
|
35
|
+
return False
|
|
36
|
+
code = ord(char)
|
|
37
|
+
return (
|
|
38
|
+
code in constants.SPACE_CHARS
|
|
39
|
+
or (0x09 <= code <= 0x0D)
|
|
40
|
+
or (0x2000 <= code <= 0x200D)
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _skip_space(context: dict, ret: bool = False) -> str:
|
|
45
|
+
msg = context["msg"]
|
|
46
|
+
length = context["length"]
|
|
47
|
+
start = context["i"]
|
|
48
|
+
if start >= length:
|
|
49
|
+
return ""
|
|
50
|
+
|
|
51
|
+
while context["i"] < length and _is_space(msg[context["i"]]):
|
|
52
|
+
context["i"] += 1
|
|
53
|
+
|
|
54
|
+
captured = msg[start : context["i"]]
|
|
55
|
+
if ret:
|
|
56
|
+
return captured
|
|
57
|
+
elif start < context["i"]:
|
|
58
|
+
_append_token(context, "space", captured)
|
|
59
|
+
return captured
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _recursion(context: dict) -> SyntaxError:
|
|
63
|
+
return SyntaxError(f"Too much recursion at position {context['i']}")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _unexpected(char, index=None) -> SyntaxError:
|
|
67
|
+
if isinstance(char, dict):
|
|
68
|
+
index = char["i"]
|
|
69
|
+
c = char["msg"][index] if index < char["length"] else "<EOF>"
|
|
70
|
+
return _unexpected(c, index)
|
|
71
|
+
return SyntaxError(f'Unexpected "{char}" at position {index}')
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _expected(char, found, index=None) -> SyntaxError:
|
|
75
|
+
if isinstance(found, dict):
|
|
76
|
+
index = found["i"]
|
|
77
|
+
f = found["msg"][index] if index < found["length"] else "<EOF>"
|
|
78
|
+
return _expected(char, f, index)
|
|
79
|
+
return SyntaxError(
|
|
80
|
+
f'Expected {char} at position {index} but found "{found if found else "<EOF>"}"'
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class Parser:
|
|
85
|
+
"""ICU MessageFormat parser.
|
|
86
|
+
|
|
87
|
+
Parses ICU MessageFormat strings into ASTs (list of dicts/strings).
|
|
88
|
+
|
|
89
|
+
Args:
|
|
90
|
+
options: Parser configuration dict. Supported keys:
|
|
91
|
+
|
|
92
|
+
- ``subnumeric_types``: Types that support ``#`` (default: ``['plural', 'selectordinal']``)
|
|
93
|
+
- ``submessage_types``: Types with sub-messages (default: ``['plural', 'selectordinal', 'select']``)
|
|
94
|
+
- ``maximum_depth``: Max nesting depth (default: ``50``)
|
|
95
|
+
- ``allow_tags``: Enable XML-style tag parsing (default: ``False``)
|
|
96
|
+
- ``strict_tags``: Strict tag parsing mode (default: ``False``)
|
|
97
|
+
- ``tag_prefix``: Required tag name prefix (default: ``None``)
|
|
98
|
+
- ``tag_type``: AST node type for tags (default: ``'tag'``)
|
|
99
|
+
- ``include_indices``: Include ``start``/``end`` in AST nodes (default: ``False``)
|
|
100
|
+
- ``loose_submessages``: Allow loose submessage parsing (default: ``False``)
|
|
101
|
+
- ``allow_format_spaces``: Allow spaces in format strings (default: ``True``)
|
|
102
|
+
- ``require_other``: Require ``other`` branch (default: ``True``)
|
|
103
|
+
- ``preserve_whitespace``: Store whitespace in ``_ws`` dict on AST nodes (default: ``False``)
|
|
104
|
+
"""
|
|
105
|
+
|
|
106
|
+
def __init__(self, options: dict | None = None) -> None:
|
|
107
|
+
self.options = {
|
|
108
|
+
"subnumeric_types": ["plural", "selectordinal"],
|
|
109
|
+
"submessage_types": ["plural", "selectordinal", "select"],
|
|
110
|
+
"maximum_depth": 50,
|
|
111
|
+
"allow_tags": False,
|
|
112
|
+
"strict_tags": False,
|
|
113
|
+
"tag_prefix": None,
|
|
114
|
+
"tag_type": "tag",
|
|
115
|
+
"include_indices": False,
|
|
116
|
+
"loose_submessages": False,
|
|
117
|
+
"allow_format_spaces": True,
|
|
118
|
+
"require_other": True,
|
|
119
|
+
"preserve_whitespace": False,
|
|
120
|
+
}
|
|
121
|
+
if isinstance(options, dict):
|
|
122
|
+
self.options.update(options)
|
|
123
|
+
|
|
124
|
+
def parse(self, input: str, tokens: list | None = None) -> list:
    """Parse an ICU MessageFormat string into an AST.

    Args:
        input: The ICU MessageFormat string to parse.
        tokens: Optional list to populate with token objects. It is appended
            to in place while parsing.

    Returns:
        A list of AST nodes (strings and dicts).

    Raises:
        TypeError: If *input* is not a string, or *tokens* is neither a list
            nor ``None``.
        SyntaxError: If the message is malformed or nested too deeply.
    """
    if not isinstance(input, str):
        raise TypeError("input must be string")
    if tokens is not None and not isinstance(tokens, list):
        raise TypeError("tokens must be list or None")

    context = {
        "msg": input,
        "length": len(input),
        "i": 0,
        "depth": 0,
    }
    # The "tokens" key is only present when the caller asked for tokens;
    # _append_token keys off its presence.
    if tokens is not None:
        context["tokens"] = tokens

    try:
        return self._parse_ast(context, None)
    except RecursionError:
        # Drop the interpreter's very deep traceback; the position is what
        # matters to the caller.
        raise _recursion(context) from None
    except IndexError as err:
        # An IndexError here means the parser indexed past the end of the
        # message. Report it like every other syntax error, with a message
        # and a position, instead of a bare SyntaxError.
        raise SyntaxError(
            f"Unexpected end of input near position {context['i']}"
        ) from err
|
|
155
|
+
|
|
156
|
+
def _parse_ast(self, context: dict, parent: dict | None) -> list:
|
|
157
|
+
msg = context["msg"]
|
|
158
|
+
length = context["length"]
|
|
159
|
+
start = context["i"]
|
|
160
|
+
out: list = []
|
|
161
|
+
|
|
162
|
+
text = self._parse_text(context, parent)
|
|
163
|
+
if text:
|
|
164
|
+
out.append(text)
|
|
165
|
+
_append_token(context, "text", msg[start : context["i"]])
|
|
166
|
+
|
|
167
|
+
while context["i"] < length:
|
|
168
|
+
i = context["i"]
|
|
169
|
+
char = msg[i]
|
|
170
|
+
if char == constants.CHAR_CLOSE:
|
|
171
|
+
if not parent:
|
|
172
|
+
raise _unexpected(context)
|
|
173
|
+
break
|
|
174
|
+
|
|
175
|
+
if (
|
|
176
|
+
parent
|
|
177
|
+
and self.options["allow_tags"]
|
|
178
|
+
and msg[i : i + len(constants.TAG_END)] == constants.TAG_END
|
|
179
|
+
and self._can_read_tag(context, parent, True)
|
|
180
|
+
):
|
|
181
|
+
break
|
|
182
|
+
|
|
183
|
+
out.append(self._parse_placeholder(context, parent))
|
|
184
|
+
start = context["i"]
|
|
185
|
+
text = self._parse_text(context, parent)
|
|
186
|
+
if text:
|
|
187
|
+
out.append(text)
|
|
188
|
+
_append_token(context, "text", msg[start : context["i"]])
|
|
189
|
+
|
|
190
|
+
return out
|
|
191
|
+
|
|
192
|
+
def _can_read_tag(
|
|
193
|
+
self, context: dict, parent: dict | None, require_closing: bool = False
|
|
194
|
+
) -> bool:
|
|
195
|
+
msg = context["msg"]
|
|
196
|
+
length = context["length"]
|
|
197
|
+
current = context["i"]
|
|
198
|
+
if not self.options["allow_tags"]:
|
|
199
|
+
return False
|
|
200
|
+
|
|
201
|
+
char = msg[current] if current < length else None
|
|
202
|
+
if char != constants.CHAR_TAG_OPEN:
|
|
203
|
+
return False
|
|
204
|
+
|
|
205
|
+
if self.options["strict_tags"] and not require_closing:
|
|
206
|
+
return True
|
|
207
|
+
|
|
208
|
+
current += 1
|
|
209
|
+
char = msg[current] if current < length else None
|
|
210
|
+
|
|
211
|
+
if char == constants.CHAR_TAG_CLOSE:
|
|
212
|
+
if self.options["strict_tags"]:
|
|
213
|
+
return True
|
|
214
|
+
current += 1
|
|
215
|
+
char = msg[current] if current < length else None
|
|
216
|
+
elif require_closing:
|
|
217
|
+
return False
|
|
218
|
+
|
|
219
|
+
if self.options["tag_prefix"]:
|
|
220
|
+
prefix = self.options["tag_prefix"]
|
|
221
|
+
return prefix == msg[current : current + len(prefix)]
|
|
222
|
+
elif _is_alpha(char):
|
|
223
|
+
return True
|
|
224
|
+
|
|
225
|
+
return False
|
|
226
|
+
|
|
227
|
+
def _parse_text(
|
|
228
|
+
self,
|
|
229
|
+
context: dict,
|
|
230
|
+
parent: dict | None,
|
|
231
|
+
is_arg_style: bool = False,
|
|
232
|
+
) -> str:
|
|
233
|
+
msg = context["msg"]
|
|
234
|
+
length = context["length"]
|
|
235
|
+
start = context["i"]
|
|
236
|
+
is_hash_special = (
|
|
237
|
+
parent and parent["type"] in self.options["subnumeric_types"]
|
|
238
|
+
)
|
|
239
|
+
is_tag_special = self.options["allow_tags"]
|
|
240
|
+
allow_arg_spaces = self.options["allow_format_spaces"]
|
|
241
|
+
|
|
242
|
+
text = ""
|
|
243
|
+
trailing_space = 0
|
|
244
|
+
|
|
245
|
+
while context["i"] < length:
|
|
246
|
+
char = msg[context["i"]]
|
|
247
|
+
is_sp = _is_space(char)
|
|
248
|
+
|
|
249
|
+
if (
|
|
250
|
+
char in constants.VAR_CHARS
|
|
251
|
+
or (is_hash_special and char == constants.CHAR_HASH)
|
|
252
|
+
or (
|
|
253
|
+
is_tag_special
|
|
254
|
+
and char == constants.CHAR_TAG_OPEN
|
|
255
|
+
and self._can_read_tag(context, parent)
|
|
256
|
+
)
|
|
257
|
+
or (is_arg_style and not allow_arg_spaces and is_sp)
|
|
258
|
+
):
|
|
259
|
+
break
|
|
260
|
+
|
|
261
|
+
if is_sp:
|
|
262
|
+
trailing_space += 1
|
|
263
|
+
else:
|
|
264
|
+
trailing_space = 0
|
|
265
|
+
|
|
266
|
+
if char == constants.CHAR_ESCAPE:
|
|
267
|
+
context["i"] += 1
|
|
268
|
+
if context["i"] < length:
|
|
269
|
+
char = msg[context["i"]]
|
|
270
|
+
if char == constants.CHAR_ESCAPE:
|
|
271
|
+
text += char
|
|
272
|
+
context["i"] += 1
|
|
273
|
+
elif (
|
|
274
|
+
char in constants.VAR_CHARS
|
|
275
|
+
or (is_hash_special and char == constants.CHAR_HASH)
|
|
276
|
+
or char == constants.CHAR_TAG_OPEN
|
|
277
|
+
or char == constants.CHAR_TAG_END
|
|
278
|
+
or is_arg_style
|
|
279
|
+
):
|
|
280
|
+
text += char
|
|
281
|
+
context["i"] += 1
|
|
282
|
+
while context["i"] < length:
|
|
283
|
+
nxt = msg[context["i"]]
|
|
284
|
+
if nxt == constants.CHAR_ESCAPE:
|
|
285
|
+
context["i"] += 1
|
|
286
|
+
if (
|
|
287
|
+
context["i"] < length
|
|
288
|
+
and msg[context["i"]] == constants.CHAR_ESCAPE
|
|
289
|
+
):
|
|
290
|
+
text += nxt
|
|
291
|
+
else:
|
|
292
|
+
break
|
|
293
|
+
else:
|
|
294
|
+
text += nxt
|
|
295
|
+
context["i"] += 1
|
|
296
|
+
else:
|
|
297
|
+
context["i"] += 1
|
|
298
|
+
text += constants.CHAR_ESCAPE + char
|
|
299
|
+
else:
|
|
300
|
+
text += char
|
|
301
|
+
else:
|
|
302
|
+
text += char
|
|
303
|
+
context["i"] += 1
|
|
304
|
+
|
|
305
|
+
if is_arg_style and trailing_space:
|
|
306
|
+
trimmed = len(text) - trailing_space
|
|
307
|
+
if trimmed <= 0:
|
|
308
|
+
context["i"] = start
|
|
309
|
+
return ""
|
|
310
|
+
else:
|
|
311
|
+
context["i"] -= trailing_space
|
|
312
|
+
return text[:trimmed]
|
|
313
|
+
|
|
314
|
+
return text
|
|
315
|
+
|
|
316
|
+
def _token_indices(self, token: dict, start: int, end: int) -> dict:
|
|
317
|
+
if self.options["include_indices"]:
|
|
318
|
+
token["start"] = start
|
|
319
|
+
token["end"] = end
|
|
320
|
+
return token
|
|
321
|
+
|
|
322
|
+
def _parse_placeholder(self, context: dict, parent: dict | None) -> dict:
    """Parse the construct at the cursor: ``#``, a tag, or a ``{...}`` placeholder.

    Args:
        context: Mutable parse state. ``msg``, ``length`` and ``i`` are read
            here and ``i`` is advanced past everything that is consumed.
        parent: Token of the enclosing sub-message or tag, or ``None`` at the
            top level. When its ``type`` is listed in
            ``options["subnumeric_types"]``, a ``constants.CHAR_HASH`` at the
            cursor becomes a number token named after the parent.

    Returns:
        The token dict. A placeholder always has ``name``; depending on the
        input it also carries ``type``, ``offset``, ``format`` or
        ``options``, a ``_ws`` dict (only with
        ``options["preserve_whitespace"]``), plus whatever
        ``_token_indices`` adds.

    Raises:
        The error built by ``_expected`` when the input does not follow the
        placeholder grammar.
    """
    msg = context["msg"]
    length = context["length"]
    preserve_ws = self.options["preserve_whitespace"]
    is_hash_special = (
        parent and parent["type"] in self.options["subnumeric_types"]
    )

    start_idx = context["i"]
    char = msg[start_idx] if start_idx < length else None

    # "#" inside a sub-numeric sub-message refers to the parent's value.
    if is_hash_special and char == constants.CHAR_HASH:
        _append_token(context, "hash", char)
        context["i"] += 1
        return self._token_indices(
            {"type": "number", "name": parent["name"], "hash": True},
            start_idx,
            context["i"],
        )

    tag = self._parse_tag(context, parent)
    if tag:
        return tag

    if char != constants.CHAR_OPEN:
        raise _expected(constants.CHAR_OPEN, context)

    _append_token(context, "syntax", char)
    context["i"] += 1

    # Whitespace captured between syntax elements; only filled in (and only
    # attached to the token) when preserve_whitespace is enabled.
    ws: dict = {}
    ws_before_name = _skip_space(context, ret=preserve_ws)
    if preserve_ws:
        ws["before_name"] = ws_before_name

    name = self._parse_name(context)
    if not name:
        raise _expected("placeholder name", context)

    _append_token(context, "name", name)
    token: dict = {"name": name}

    ws_after_name = _skip_space(context, ret=preserve_ws)
    if preserve_ws:
        ws["after_name"] = ws_after_name

    char = msg[context["i"]] if context["i"] < length else None

    # Simple argument: {name}
    if char == constants.CHAR_CLOSE:
        _append_token(context, "syntax", char)
        context["i"] += 1
        if preserve_ws and ws:
            token["_ws"] = ws
        return self._token_indices(token, start_idx, context["i"])

    if char != constants.CHAR_SEP:
        raise _expected(SEP_OR_CLOSE, context)

    _append_token(context, "syntax", char)
    context["i"] += 1

    ws_after_type_sep = _skip_space(context, ret=preserve_ws)
    if preserve_ws:
        ws["after_type_sep"] = ws_after_type_sep

    ttype = self._parse_name(context)
    if not ttype:
        raise _expected("placeholder type", context)

    _append_token(context, "type", ttype)
    token["type"] = ttype

    ws_after_type = _skip_space(context, ret=preserve_ws)
    if preserve_ws:
        ws["after_type"] = ws_after_type

    char = msg[context["i"]] if context["i"] < length else None

    # Typed argument without a style: {name, type}
    if char == constants.CHAR_CLOSE:
        _append_token(context, "syntax", char)
        # Types that need sub-messages cannot end right after the type.
        if ttype in self.options["submessage_types"]:
            raise _expected(f"{ttype} sub-messages", context)
        context["i"] += 1
        if preserve_ws and ws:
            token["_ws"] = ws
        return self._token_indices(token, start_idx, context["i"])

    if char != constants.CHAR_SEP:
        raise _expected(SEP_OR_CLOSE, context)

    _append_token(context, "syntax", char)
    context["i"] += 1

    ws_after_style_sep = _skip_space(context, ret=preserve_ws)
    if preserve_ws:
        ws["after_style_sep"] = ws_after_style_sep

    if ttype in self.options["subnumeric_types"]:
        offset = self._parse_offset(context)
        token["offset"] = offset or 0
        if offset:
            _skip_space(context)

    # Whitespace found between a style string and the closing brace. It is
    # consumed while the style is read, so it must be carried over into
    # ``before_close`` below or it would be lost from the ``_ws`` metadata.
    style_trailing_ws = ""

    if ttype in self.options["submessage_types"]:
        messages = self._parse_submessages(context, token)
        if not messages:
            raise _expected(f"{ttype} sub-messages", context)
        token["options"] = messages
    else:
        start = context["i"]
        fmt = self._parse_text(context, token, True)
        if not fmt:
            raise _expected("placeholder style", context)

        end = context["i"]
        spaces = _skip_space(context, True)

        if (
            self.options["loose_submessages"]
            and context["i"] < length
            and msg[context["i"]] == constants.CHAR_OPEN
        ):
            # What looked like a style is really the first selector of an
            # unknown type's sub-messages: rewind and parse them as such.
            context["i"] = start
            messages = self._parse_submessages(context, token)
            if not messages:
                raise _expected(f"{ttype} sub-messages", context)
            token["options"] = messages
        else:
            token["format"] = fmt
            _append_token(context, "style", msg[start:end])
            if spaces:
                _append_token(context, "space", spaces)
                style_trailing_ws = spaces

    ws_before_close = _skip_space(context, ret=preserve_ws)
    if preserve_ws:
        if style_trailing_ws:
            ws_before_close = style_trailing_ws + (ws_before_close or "")
        ws["before_close"] = ws_before_close

    char = msg[context["i"]] if context["i"] < length else None
    if char != constants.CHAR_CLOSE:
        raise _expected(constants.CHAR_CLOSE, context)

    _append_token(context, "syntax", char)
    context["i"] += 1

    if preserve_ws and ws:
        token["_ws"] = ws

    return self._token_indices(token, start_idx, context["i"])
|
|
468
|
+
|
|
469
|
+
def _parse_tag(self, context: dict, parent: dict | None) -> dict | None:
|
|
470
|
+
if not self.options["allow_tags"]:
|
|
471
|
+
return None
|
|
472
|
+
|
|
473
|
+
if not self._can_read_tag(context, parent):
|
|
474
|
+
return None
|
|
475
|
+
|
|
476
|
+
msg = context["msg"]
|
|
477
|
+
length = context["length"]
|
|
478
|
+
i = context["i"]
|
|
479
|
+
start_idx = i
|
|
480
|
+
char = msg[i] if i < length else None
|
|
481
|
+
|
|
482
|
+
if char != constants.CHAR_TAG_OPEN:
|
|
483
|
+
return None
|
|
484
|
+
|
|
485
|
+
if msg[i : i + len(constants.TAG_END)] == constants.TAG_END:
|
|
486
|
+
raise _unexpected(constants.TAG_END, i)
|
|
487
|
+
|
|
488
|
+
context["i"] += 1
|
|
489
|
+
name = self._parse_name(context, True)
|
|
490
|
+
if not name:
|
|
491
|
+
if not self.options["strict_tags"]:
|
|
492
|
+
context["i"] = start_idx
|
|
493
|
+
return None
|
|
494
|
+
raise _expected("tag name", context)
|
|
495
|
+
|
|
496
|
+
_append_token(context, "syntax", char)
|
|
497
|
+
token: dict = {"type": self.options["tag_type"], "name": name}
|
|
498
|
+
_append_token(context, "name", name)
|
|
499
|
+
_skip_space(context)
|
|
500
|
+
|
|
501
|
+
i = context["i"]
|
|
502
|
+
if (
|
|
503
|
+
i < length
|
|
504
|
+
and msg[i : i + len(constants.TAG_CLOSING)] == constants.TAG_CLOSING
|
|
505
|
+
):
|
|
506
|
+
_append_token(context, "syntax", constants.TAG_CLOSING)
|
|
507
|
+
context["i"] += len(constants.TAG_CLOSING)
|
|
508
|
+
return self._token_indices(token, start_idx, context["i"])
|
|
509
|
+
|
|
510
|
+
char = msg[i] if i < length else None
|
|
511
|
+
if char != constants.CHAR_TAG_END:
|
|
512
|
+
raise _expected(
|
|
513
|
+
constants.CHAR_TAG_END + " or " + constants.TAG_CLOSING, context
|
|
514
|
+
)
|
|
515
|
+
|
|
516
|
+
_append_token(context, "syntax", char)
|
|
517
|
+
context["i"] += 1
|
|
518
|
+
|
|
519
|
+
children = self._parse_ast(context, token)
|
|
520
|
+
if children:
|
|
521
|
+
token["contents"] = children
|
|
522
|
+
end = context["i"]
|
|
523
|
+
|
|
524
|
+
if (
|
|
525
|
+
end < length
|
|
526
|
+
and msg[end : end + len(constants.TAG_END)] != constants.TAG_END
|
|
527
|
+
):
|
|
528
|
+
raise _expected(constants.TAG_END, context)
|
|
529
|
+
|
|
530
|
+
_append_token(context, "syntax", constants.TAG_END)
|
|
531
|
+
context["i"] += len(constants.TAG_END)
|
|
532
|
+
|
|
533
|
+
close_name = self._parse_name(context, True)
|
|
534
|
+
if close_name:
|
|
535
|
+
_append_token(context, "name", close_name)
|
|
536
|
+
if close_name != name:
|
|
537
|
+
raise _expected(
|
|
538
|
+
constants.TAG_END + name + constants.CHAR_TAG_END,
|
|
539
|
+
msg[end] if end < length else "<EOF>",
|
|
540
|
+
end,
|
|
541
|
+
)
|
|
542
|
+
|
|
543
|
+
_skip_space(context)
|
|
544
|
+
char = msg[context["i"]] if context["i"] < length else None
|
|
545
|
+
if char != constants.CHAR_TAG_END:
|
|
546
|
+
raise _expected(constants.CHAR_TAG_END, context)
|
|
547
|
+
|
|
548
|
+
_append_token(context, "syntax", char)
|
|
549
|
+
context["i"] += 1
|
|
550
|
+
return self._token_indices(token, start_idx, context["i"])
|
|
551
|
+
|
|
552
|
+
def _parse_name(self, context: dict, is_tag: bool = False) -> str:
|
|
553
|
+
msg = context["msg"]
|
|
554
|
+
length = context["length"]
|
|
555
|
+
name = ""
|
|
556
|
+
|
|
557
|
+
while context["i"] < length:
|
|
558
|
+
char = msg[context["i"]]
|
|
559
|
+
if (
|
|
560
|
+
char in constants.VAR_CHARS
|
|
561
|
+
or char == constants.CHAR_SEP
|
|
562
|
+
or char == constants.CHAR_HASH
|
|
563
|
+
or char == constants.CHAR_ESCAPE
|
|
564
|
+
or _is_space(char)
|
|
565
|
+
or (is_tag and char in constants.TAG_CHARS)
|
|
566
|
+
):
|
|
567
|
+
break
|
|
568
|
+
name += char
|
|
569
|
+
context["i"] += 1
|
|
570
|
+
|
|
571
|
+
return name
|
|
572
|
+
|
|
573
|
+
def _parse_offset(self, context: dict) -> int:
|
|
574
|
+
msg = context["msg"]
|
|
575
|
+
length = context["length"]
|
|
576
|
+
start = context["i"]
|
|
577
|
+
|
|
578
|
+
if start >= length or msg[start : start + len(constants.OFFSET)] != constants.OFFSET:
|
|
579
|
+
return 0
|
|
580
|
+
|
|
581
|
+
_append_token(context, "offset", constants.OFFSET)
|
|
582
|
+
context["i"] += len(constants.OFFSET)
|
|
583
|
+
_skip_space(context)
|
|
584
|
+
|
|
585
|
+
start = context["i"]
|
|
586
|
+
while context["i"] < length and (
|
|
587
|
+
_is_digit(msg[context["i"]])
|
|
588
|
+
or (context["i"] == start and msg[context["i"]] == "-")
|
|
589
|
+
):
|
|
590
|
+
context["i"] += 1
|
|
591
|
+
|
|
592
|
+
if start == context["i"]:
|
|
593
|
+
raise _expected("offset number", context)
|
|
594
|
+
|
|
595
|
+
offset = msg[start : context["i"]]
|
|
596
|
+
_append_token(context, "number", offset)
|
|
597
|
+
return int(offset, 10)
|
|
598
|
+
|
|
599
|
+
def _parse_submessages(self, context: dict, parent: dict) -> dict | None:
|
|
600
|
+
msg = context["msg"]
|
|
601
|
+
length = context["length"]
|
|
602
|
+
preserve_ws = self.options["preserve_whitespace"]
|
|
603
|
+
options: dict = {}
|
|
604
|
+
|
|
605
|
+
context["depth"] += 1
|
|
606
|
+
|
|
607
|
+
while context["i"] < length and msg[context["i"]] != constants.CHAR_CLOSE:
|
|
608
|
+
# Save position before consuming space so we can rewind if we hit }
|
|
609
|
+
pre_space_pos = context["i"]
|
|
610
|
+
ws_before_selector = _skip_space(context, ret=preserve_ws) if preserve_ws else _skip_space(context)
|
|
611
|
+
|
|
612
|
+
if context["i"] >= length or msg[context["i"]] == constants.CHAR_CLOSE:
|
|
613
|
+
# Rewind: this trailing space belongs to the outer placeholder's before_close
|
|
614
|
+
if preserve_ws:
|
|
615
|
+
context["i"] = pre_space_pos
|
|
616
|
+
break
|
|
617
|
+
|
|
618
|
+
selector = self._parse_name(context)
|
|
619
|
+
if not selector:
|
|
620
|
+
raise _expected("sub-message selector", context)
|
|
621
|
+
_append_token(context, "selector", selector)
|
|
622
|
+
|
|
623
|
+
ws_after_selector = _skip_space(context, ret=preserve_ws) if preserve_ws else _skip_space(context)
|
|
624
|
+
|
|
625
|
+
submessage = self._parse_submessage(context, parent)
|
|
626
|
+
|
|
627
|
+
if preserve_ws:
|
|
628
|
+
# Store whitespace metadata on the submessage list
|
|
629
|
+
# We use a wrapper dict so we can attach _ws
|
|
630
|
+
options[selector] = submessage
|
|
631
|
+
# Store selector whitespace as metadata on the options dict
|
|
632
|
+
if "_ws" not in options:
|
|
633
|
+
options["_ws"] = {}
|
|
634
|
+
options["_ws"][selector] = {
|
|
635
|
+
"before_selector": ws_before_selector or "",
|
|
636
|
+
"after_selector": ws_after_selector or "",
|
|
637
|
+
}
|
|
638
|
+
else:
|
|
639
|
+
options[selector] = submessage
|
|
640
|
+
|
|
641
|
+
if not preserve_ws:
|
|
642
|
+
_skip_space(context)
|
|
643
|
+
|
|
644
|
+
context["depth"] -= 1
|
|
645
|
+
|
|
646
|
+
if not options or (len(options) == 1 and "_ws" in options):
|
|
647
|
+
return None
|
|
648
|
+
|
|
649
|
+
req = self.options["require_other"]
|
|
650
|
+
ttype = parent["type"] if parent else None
|
|
651
|
+
if req == "all":
|
|
652
|
+
req = True
|
|
653
|
+
elif req == "subnumeric":
|
|
654
|
+
req = self.options["subnumeric_types"]
|
|
655
|
+
elif req and not isinstance(req, list):
|
|
656
|
+
req = self.options["submessage_types"]
|
|
657
|
+
if isinstance(req, list):
|
|
658
|
+
req = ttype in req
|
|
659
|
+
|
|
660
|
+
if req and "other" not in options:
|
|
661
|
+
raise _expected(f"{ttype} sub-message other", context)
|
|
662
|
+
|
|
663
|
+
return options
|
|
664
|
+
|
|
665
|
+
def _parse_submessage(self, context: dict, parent: dict) -> list:
    """Parse one brace-delimited sub-message body.

    Args:
        context: Mutable parse state; ``i`` is advanced past the closing
            brace and ``depth`` is checked against
            ``options["maximum_depth"]``.
        parent: Token that owns the sub-message; forwarded to ``_parse_ast``.

    Returns:
        Whatever ``_parse_ast`` produces for the text between the braces.

    Raises:
        The error built by ``_recursion`` when the depth limit is reached,
        or by ``_expected`` when an opening or closing brace is missing.
    """
    if context["depth"] >= self.options["maximum_depth"]:
        raise _recursion(context)

    text = context["msg"]
    size = context["length"]

    def current():
        # Character under the cursor, or None at the end of the message.
        return text[context["i"]] if context["i"] < size else None

    if current() != constants.CHAR_OPEN:
        raise _expected(constants.CHAR_OPEN, context)
    _append_token(context, "syntax", constants.CHAR_OPEN)
    context["i"] += 1

    body = self._parse_ast(context, parent)

    if current() != constants.CHAR_CLOSE:
        raise _expected(constants.CHAR_CLOSE, context)
    _append_token(context, "syntax", constants.CHAR_CLOSE)
    context["i"] += 1

    return body
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""AST to ICU MessageFormat string reconstruction.
|
|
2
|
+
|
|
3
|
+
Reconstructs an ICU MessageFormat string from an AST produced by
|
|
4
|
+
:class:`Parser`. When the AST contains ``_ws`` whitespace metadata
|
|
5
|
+
(from ``preserve_whitespace=True``), the reconstruction is lossless.
|
|
6
|
+
Otherwise, normalized whitespace (single spaces) is used.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
|
|
13
|
+
from . import _constants as constants
|
|
14
|
+
|
|
15
|
+
# Matches from the first ``{`` or ``}`` in a string through the last one
# (the inner ``[\s\S]*`` is greedy and also crosses newlines); a single brace
# matches on its own. Group 1 is the whole matched region.
_BRACE_RE = re.compile(r"([{}](?:[\s\S]*[{}])?)")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def print_ast(ast: list) -> str:
    """Serialize a parsed AST back into ICU MessageFormat source text.

    Args:
        ast: List of nodes (literal strings and token dicts) in the shape
            produced by :meth:`Parser.parse`.

    Returns:
        The message string rebuilt from *ast*.
    """
    rendered = _print_nodes(ast)
    return rendered
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _escape_message(message: str) -> str:
    """Quote the brace-bearing part of *message* with single quotes.

    The quoted region is the first match of ``_BRACE_RE``; text without any
    brace is returned unchanged.

    Port of JS ``printEscapedMessage``.
    """
    found = _BRACE_RE.search(message)
    if found is None:
        return message
    head = message[: found.start()]
    tail = message[found.end() :]
    return f"{head}'{found.group(1)}'{tail}"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _print_literal(
    value: str, is_in_plural: bool, is_first: bool, is_last: bool
) -> str:
    """Re-escape a literal text node so it parses back to the same text.

    Port of JS ``printLiteralElement``, with one correction: a literal that
    is exactly ``'`` and sits between two other nodes is doubled once
    (``''``) instead of being doubled from both ends (``'''``). The
    three-apostrophe form would open a quoted section when followed by a
    placeholder.

    Args:
        value: The literal text of the node.
        is_in_plural: Whether the node sits directly in a plural-style
            sub-message, where ``#`` must be quoted.
        is_first: Whether this is the first node of its sibling list.
        is_last: Whether this is the last node of its sibling list.

    Returns:
        The escaped text.
    """
    escaped = value

    # Double a leading ' when another node precedes this one (prevents a
    # mis-parse right after a closing brace).
    doubled_leading = not is_first and value[:1] == "'"
    if doubled_leading:
        escaped = "''" + escaped[1:]

    # Double a trailing ' when another node follows (prevents a mis-parse
    # right before an opening brace). Skip it when that apostrophe is the
    # same single character that was already doubled above.
    already_doubled = doubled_leading and len(value) == 1
    if not is_last and value[-1:] == "'" and not already_doubled:
        escaped = escaped[:-1] + "''"

    # Re-escape {} regions
    escaped = _escape_message(escaped)

    # Re-escape # in plural context
    if is_in_plural:
        escaped = escaped.replace("#", "'#'")

    return escaped
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _print_nodes(nodes: list, is_in_plural: bool = False) -> str:
|
|
66
|
+
parts: list[str] = []
|
|
67
|
+
for i, node in enumerate(nodes):
|
|
68
|
+
if isinstance(node, str):
|
|
69
|
+
parts.append(
|
|
70
|
+
_print_literal(node, is_in_plural, i == 0, i == len(nodes) - 1)
|
|
71
|
+
)
|
|
72
|
+
elif isinstance(node, dict):
|
|
73
|
+
parts.append(_print_node(node))
|
|
74
|
+
return "".join(parts)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _print_node(node: dict) -> str:
    """Render one token dict as ICU MessageFormat source text.

    Handles, in this order: the ``#`` shorthand (``hash`` flag), tags
    (``type == "tag"`` or any typed node with ``contents``), bare
    ``{name}`` arguments, and typed placeholders with either a ``format``
    string or ``options`` sub-messages. Whitespace comes from the node's
    ``_ws`` metadata when present, otherwise from built-in defaults.

    Args:
        node: Token dict from the AST.

    Returns:
        The source text for *node*.
    """
    spacing = node.get("_ws", {})

    # Hash replacement node (#)
    if node.get("hash"):
        return constants.CHAR_HASH

    name = node.get("name", "")
    kind = node.get("type")

    # Tag node
    if kind == "tag" or (kind and node.get("contents") is not None):
        inner = _print_nodes(node.get("contents", []))
        return "".join(
            (
                constants.CHAR_TAG_OPEN,
                name,
                constants.CHAR_TAG_END,
                inner,
                constants.TAG_END,
                name,
                constants.CHAR_TAG_END,
            )
        )

    has_options = "options" in node
    has_format = "format" in node

    # Simple variable: {name}
    if kind is None and not has_options and not has_format:
        return (
            constants.CHAR_OPEN
            + spacing.get("before_name", "")
            + name
            + spacing.get("after_name", "")
            + constants.CHAR_CLOSE
        )

    # Typed placeholder
    # JS printAST uses no spaces after commas for select/plural but spaces
    # for simple format types (number, date, time).
    fallback_gap = "" if has_options else " "
    gap_after_style_sep = spacing.get("after_style_sep", fallback_gap)
    closing = spacing.get("before_close", "") + constants.CHAR_CLOSE

    pieces = [
        constants.CHAR_OPEN,
        spacing.get("before_name", ""),
        name,
        spacing.get("after_name", ""),
    ]

    if kind:
        pieces += [
            constants.CHAR_SEP,
            spacing.get("after_type_sep", fallback_gap),
            kind,
            spacing.get("after_type", ""),
        ]

    # Format string (e.g. {n, number, ::currency/USD})
    if has_format:
        pieces += [constants.CHAR_SEP, gap_after_style_sep, node["format"]]
    # Submessages (plural, select, selectordinal)
    elif has_options:
        pieces += [constants.CHAR_SEP, gap_after_style_sep]

        # Offset for plural/selectordinal
        offset = node.get("offset", 0)
        if offset:
            pieces.append(f"offset:{offset} ")

        options = node["options"]
        raw_ws = options.get("_ws")
        selector_spacing = raw_ws if isinstance(raw_ws, dict) else {}

        numeric = kind in ("plural", "selectordinal")

        emitted_any = False
        for selector, body in options.items():
            if selector == "_ws":
                continue

            sel_ws = selector_spacing.get(selector, {})
            # The first selector follows the separator gap directly; later
            # ones are preceded by their recorded (or a single) space.
            if emitted_any:
                pieces.append(sel_ws.get("before_selector", " "))
            emitted_any = True

            pieces += [
                selector,
                sel_ws.get("after_selector", ""),
                constants.CHAR_OPEN,
            ]
            if isinstance(body, list):
                pieces.append(_print_nodes(body, is_in_plural=numeric))
            pieces.append(constants.CHAR_CLOSE)

    pieces.append(closing)
    return "".join(pieces)
|