sembr 0.2.2__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sembr-0.2.2 → sembr-0.2.4}/PKG-INFO +48 -9
- sembr-0.2.2/sembr.egg-info/PKG-INFO → sembr-0.2.4/README.md +37 -28
- {sembr-0.2.2 → sembr-0.2.4}/pyproject.toml +11 -7
- {sembr-0.2.2 → sembr-0.2.4}/sembr/__init__.py +1 -1
- {sembr-0.2.2 → sembr-0.2.4}/sembr/cli.py +51 -16
- {sembr-0.2.2 → sembr-0.2.4}/sembr/inference.py +1 -1
- {sembr-0.2.2 → sembr-0.2.4}/sembr/mcp.py +27 -9
- sembr-0.2.4/sembr/processors/__init__.py +15 -0
- sembr-0.2.4/sembr/processors/base.py +294 -0
- sembr-0.2.4/sembr/processors/latex.py +384 -0
- sembr-0.2.4/sembr/processors/markdown.py +298 -0
- sembr-0.2.4/sembr/processors/plaintext.py +241 -0
- sembr-0.2.4/sembr/processors/utils.py +70 -0
- sembr-0.2.2/README.md → sembr-0.2.4/sembr.egg-info/PKG-INFO +67 -2
- {sembr-0.2.2 → sembr-0.2.4}/sembr.egg-info/SOURCES.txt +7 -2
- sembr-0.2.4/sembr.egg-info/requires.txt +13 -0
- sembr-0.2.2/sembr/process.py +0 -302
- sembr-0.2.2/sembr.egg-info/requires.txt +0 -9
- {sembr-0.2.2 → sembr-0.2.4}/LICENSE +0 -0
- {sembr-0.2.2 → sembr-0.2.4}/sembr/databuilder.py +0 -0
- {sembr-0.2.2 → sembr-0.2.4}/sembr/dataset.py +0 -0
- {sembr-0.2.2 → sembr-0.2.4}/sembr/eval.py +0 -0
- {sembr-0.2.2 → sembr-0.2.4}/sembr/train.py +0 -0
- {sembr-0.2.2 → sembr-0.2.4}/sembr/utils.py +0 -0
- {sembr-0.2.2 → sembr-0.2.4}/sembr.egg-info/dependency_links.txt +0 -0
- {sembr-0.2.2 → sembr-0.2.4}/sembr.egg-info/entry_points.txt +0 -0
- {sembr-0.2.2 → sembr-0.2.4}/sembr.egg-info/top_level.txt +0 -0
- {sembr-0.2.2 → sembr-0.2.4}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sembr
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: A semantic linebreaker powered by transformers
|
|
5
5
|
Author: admk
|
|
6
6
|
License-Expression: MIT
|
|
@@ -14,17 +14,21 @@ Requires-Python: >=3.10
|
|
|
14
14
|
Description-Content-Type: text/markdown
|
|
15
15
|
License-File: LICENSE
|
|
16
16
|
Requires-Dist: accelerate
|
|
17
|
-
Requires-Dist: transformers
|
|
18
|
-
Requires-Dist: torch
|
|
17
|
+
Requires-Dist: fastmcp
|
|
18
|
+
Requires-Dist: flask
|
|
19
|
+
Requires-Dist: magika
|
|
20
|
+
Requires-Dist: mcp[cli]
|
|
19
21
|
Requires-Dist: numpy
|
|
20
|
-
Requires-Dist: tqdm
|
|
22
|
+
Requires-Dist: pydantic
|
|
21
23
|
Requires-Dist: requests
|
|
22
|
-
Requires-Dist: flask
|
|
23
|
-
Requires-Dist: mcp[cli]>=1.2.0
|
|
24
|
-
Requires-Dist: fastmcp>=2.10.6
|
|
24
|
+
Requires-Dist: torch
|
|
25
|
+
Requires-Dist: tqdm
|
|
26
|
+
Requires-Dist: transformers
|
|
27
|
+
Requires-Dist: tree-sitter>=0.25.0
|
|
28
|
+
Requires-Dist: tree-sitter-markdown>=0.3.2
|
|
25
29
|
Dynamic: license-file
|
|
26
30
|
|
|
27
|
-
# Semantic Line Breaker (SemBr)
|
|
31
|
+
# ⚡️ Semantic Line Breaker (SemBr)
|
|
28
32
|
|
|
29
33
|
[](LICENSE)
|
|
30
34
|
[](https://www.python.org)
|
|
@@ -45,6 +49,9 @@ SemBr is a command-line tool
|
|
|
45
49
|
powered by [Transformer][transformers1] [models][transformers2]
|
|
46
50
|
that performs [semantic linebreaks](#what-are-semantic-line-breaks)
|
|
47
51
|
to break lines in a text file at semantic boundaries.
|
|
52
|
+
It supports multiple file types
|
|
53
|
+
including LaTeX, Markdown, and plain text,
|
|
54
|
+
with automatic file type detection.
|
|
48
55
|
|
|
49
56
|
### Installation
|
|
50
57
|
|
|
@@ -66,6 +73,34 @@ sembr # run
|
|
|
66
73
|
uvx sembr # install and run directly
|
|
67
74
|
```
|
|
68
75
|
|
|
76
|
+
#### From GitHub (Latest Development Version)
|
|
77
|
+
|
|
78
|
+
To install the latest development version directly from GitHub:
|
|
79
|
+
|
|
80
|
+
```shell
|
|
81
|
+
# Install from GitHub main branch
|
|
82
|
+
uv tool install git+https://github.com/admk/sembr.git
|
|
83
|
+
|
|
84
|
+
# Run directly without installing
|
|
85
|
+
uvx --from git+https://github.com/admk/sembr.git sembr
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Alternatively, clone and install in development mode:
|
|
89
|
+
|
|
90
|
+
```shell
|
|
91
|
+
# Clone the repository
|
|
92
|
+
git clone https://github.com/admk/sembr.git
|
|
93
|
+
cd sembr
|
|
94
|
+
|
|
95
|
+
# Install in development mode
|
|
96
|
+
pip install -e .
|
|
97
|
+
|
|
98
|
+
# Or with uv
|
|
99
|
+
uv pip install -e .
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Note that the development version may include experimental features and could be less stable than the PyPI release.
|
|
103
|
+
|
|
69
104
|
### Supported Platforms
|
|
70
105
|
|
|
71
106
|
SemBr is supported on Linux, Mac and Windows.
|
|
@@ -155,6 +190,9 @@ to customize the behavior of SemBr:
|
|
|
155
190
|
* `--dtype <dtype>`:
|
|
156
191
|
Data type for model weights (e.g. `float16`, `bfloat16`).
|
|
157
192
|
Default is `float32`.
|
|
193
|
+
* `--file-type <type>`:
|
|
194
|
+
File type (`plaintext`, `latex`, `markdown`, etc.).
|
|
195
|
+
Auto-detected using [Magika][magika] if not provided.
|
|
158
196
|
* `--mcp`:
|
|
159
197
|
Start MCP server mode instead of processing text.
|
|
160
198
|
|
|
@@ -357,7 +395,7 @@ to save best models.
|
|
|
357
395
|
- Natural language support:
|
|
358
396
|
- [ ] Support natural languages other than English.
|
|
359
397
|
- Typesetting languages support:
|
|
360
|
-
- [ ] Markdown.
|
|
398
|
+
- [x] ~~Markdown.~~
|
|
361
399
|
- [ ] Typst.
|
|
362
400
|
- Usability:
|
|
363
401
|
- [ ] Inference queue.
|
|
@@ -405,6 +443,7 @@ Semantic line breaking:
|
|
|
405
443
|
[pypi]: https://pypi.org/project/sembr
|
|
406
444
|
[uv]: https://github.com/astral-sh/uv
|
|
407
445
|
[mcp]: https://modelcontextprotocol.io/overview
|
|
446
|
+
[magika]: https://github.com/google/magika
|
|
408
447
|
|
|
409
448
|
[sembr]: https://sembr.org
|
|
410
449
|
[semlf]: https://rhodesmill.org/brandon/2012/one-sentence-per-line
|
|
@@ -1,30 +1,4 @@
|
|
|
1
|
-
|
|
2
|
-
Name: sembr
|
|
3
|
-
Version: 0.2.2
|
|
4
|
-
Summary: A semantic linebreaker powered by transformers
|
|
5
|
-
Author: admk
|
|
6
|
-
License-Expression: MIT
|
|
7
|
-
Project-URL: Homepage, https://github.com/admk/sembr
|
|
8
|
-
Project-URL: Issues, https://github.com/admk/sembr/issues
|
|
9
|
-
Classifier: Programming Language :: Python :: 3
|
|
10
|
-
Classifier: Operating System :: OS Independent
|
|
11
|
-
Classifier: Topic :: Utilities
|
|
12
|
-
Classifier: Environment :: Console
|
|
13
|
-
Requires-Python: >=3.10
|
|
14
|
-
Description-Content-Type: text/markdown
|
|
15
|
-
License-File: LICENSE
|
|
16
|
-
Requires-Dist: accelerate
|
|
17
|
-
Requires-Dist: transformers
|
|
18
|
-
Requires-Dist: torch
|
|
19
|
-
Requires-Dist: numpy
|
|
20
|
-
Requires-Dist: tqdm
|
|
21
|
-
Requires-Dist: requests
|
|
22
|
-
Requires-Dist: flask
|
|
23
|
-
Requires-Dist: mcp[cli]>=1.2.0
|
|
24
|
-
Requires-Dist: fastmcp>=2.10.6
|
|
25
|
-
Dynamic: license-file
|
|
26
|
-
|
|
27
|
-
# Semantic Line Breaker (SemBr)
|
|
1
|
+
# ⚡️ Semantic Line Breaker (SemBr)
|
|
28
2
|
|
|
29
3
|
[](LICENSE)
|
|
30
4
|
[](https://www.python.org)
|
|
@@ -45,6 +19,9 @@ SemBr is a command-line tool
|
|
|
45
19
|
powered by [Transformer][transformers1] [models][transformers2]
|
|
46
20
|
that performs [semantic linebreaks](#what-are-semantic-line-breaks)
|
|
47
21
|
to break lines in a text file at semantic boundaries.
|
|
22
|
+
It supports multiple file types
|
|
23
|
+
including LaTeX, Markdown, and plain text,
|
|
24
|
+
with automatic file type detection.
|
|
48
25
|
|
|
49
26
|
### Installation
|
|
50
27
|
|
|
@@ -66,6 +43,34 @@ sembr # run
|
|
|
66
43
|
uvx sembr # install and run directly
|
|
67
44
|
```
|
|
68
45
|
|
|
46
|
+
#### From GitHub (Latest Development Version)
|
|
47
|
+
|
|
48
|
+
To install the latest development version directly from GitHub:
|
|
49
|
+
|
|
50
|
+
```shell
|
|
51
|
+
# Install from GitHub main branch
|
|
52
|
+
uv tool install git+https://github.com/admk/sembr.git
|
|
53
|
+
|
|
54
|
+
# Run directly without installing
|
|
55
|
+
uvx --from git+https://github.com/admk/sembr.git sembr
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Alternatively, clone and install in development mode:
|
|
59
|
+
|
|
60
|
+
```shell
|
|
61
|
+
# Clone the repository
|
|
62
|
+
git clone https://github.com/admk/sembr.git
|
|
63
|
+
cd sembr
|
|
64
|
+
|
|
65
|
+
# Install in development mode
|
|
66
|
+
pip install -e .
|
|
67
|
+
|
|
68
|
+
# Or with uv
|
|
69
|
+
uv pip install -e .
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Note that the development version may include experimental features and could be less stable than the PyPI release.
|
|
73
|
+
|
|
69
74
|
### Supported Platforms
|
|
70
75
|
|
|
71
76
|
SemBr is supported on Linux, Mac and Windows.
|
|
@@ -155,6 +160,9 @@ to customize the behavior of SemBr:
|
|
|
155
160
|
* `--dtype <dtype>`:
|
|
156
161
|
Data type for model weights (e.g. `float16`, `bfloat16`).
|
|
157
162
|
Default is `float32`.
|
|
163
|
+
* `--file-type <type>`:
|
|
164
|
+
File type (`plaintext`, `latex`, `markdown`, etc.).
|
|
165
|
+
Auto-detected using [Magika][magika] if not provided.
|
|
158
166
|
* `--mcp`:
|
|
159
167
|
Start MCP server mode instead of processing text.
|
|
160
168
|
|
|
@@ -357,7 +365,7 @@ to save best models.
|
|
|
357
365
|
- Natural language support:
|
|
358
366
|
- [ ] Support natural languages other than English.
|
|
359
367
|
- Typesetting languages support:
|
|
360
|
-
- [ ] Markdown.
|
|
368
|
+
- [x] ~~Markdown.~~
|
|
361
369
|
- [ ] Typst.
|
|
362
370
|
- Usability:
|
|
363
371
|
- [ ] Inference queue.
|
|
@@ -405,6 +413,7 @@ Semantic line breaking:
|
|
|
405
413
|
[pypi]: https://pypi.org/project/sembr
|
|
406
414
|
[uv]: https://github.com/astral-sh/uv
|
|
407
415
|
[mcp]: https://modelcontextprotocol.io/overview
|
|
416
|
+
[magika]: https://github.com/google/magika
|
|
408
417
|
|
|
409
418
|
[sembr]: https://sembr.org
|
|
410
419
|
[semlf]: https://rhodesmill.org/brandon/2012/one-sentence-per-line
|
|
@@ -15,14 +15,18 @@ license-files = ["LICEN[CS]E*"]
|
|
|
15
15
|
requires-python = ">=3.10"
|
|
16
16
|
dependencies = [
|
|
17
17
|
"accelerate",
|
|
18
|
-
"transformers",
|
|
19
|
-
"torch",
|
|
18
|
+
"fastmcp",
|
|
19
|
+
"flask",
|
|
20
|
+
"magika",
|
|
21
|
+
"mcp[cli]",
|
|
20
22
|
"numpy",
|
|
21
|
-
"tqdm",
|
|
23
|
+
"pydantic",
|
|
22
24
|
"requests",
|
|
23
|
-
"flask",
|
|
24
|
-
"mcp[cli]>=1.2.0",
|
|
25
|
-
"fastmcp>=2.10.6",
|
|
25
|
+
"torch",
|
|
26
|
+
"tqdm",
|
|
27
|
+
"transformers",
|
|
28
|
+
"tree-sitter>=0.25.0",
|
|
29
|
+
"tree-sitter-markdown>=0.3.2",
|
|
26
30
|
]
|
|
27
31
|
classifiers = [
|
|
28
32
|
"Programming Language :: Python :: 3",
|
|
@@ -39,7 +43,7 @@ Homepage = "https://github.com/admk/sembr"
|
|
|
39
43
|
Issues = "https://github.com/admk/sembr/issues"
|
|
40
44
|
|
|
41
45
|
[tool.setuptools.packages.find]
|
|
42
|
-
include = ["sembr"]
|
|
46
|
+
include = ["sembr", "sembr.*"]
|
|
43
47
|
exclude = ["data*", "tests*"]
|
|
44
48
|
|
|
45
49
|
[tool.setuptools.dynamic]
|
|
@@ -18,7 +18,9 @@ def cli_parser():
|
|
|
18
18
|
p = argparse.ArgumentParser(
|
|
19
19
|
description='SemBr: Rewrap text with semantic breaks.')
|
|
20
20
|
model_name = 'admko/sembr2023-bert-small'
|
|
21
|
-
p.add_argument('-
|
|
21
|
+
p.add_argument('-V', '--version', action='version', version=__version__)
|
|
22
|
+
p.add_argument(
|
|
23
|
+
'-v', '--verbose', action='store_true', help='Enable verbose output')
|
|
22
24
|
p.add_argument('-m', '--model-name', type=str, default=model_name)
|
|
23
25
|
p.add_argument('-i', '--input-file', type=str, default=None)
|
|
24
26
|
p.add_argument('-o', '--output-file', type=str, default=None)
|
|
@@ -35,15 +37,25 @@ def cli_parser():
|
|
|
35
37
|
p.add_argument('--dtype', type=str, default=None)
|
|
36
38
|
p.add_argument('--debug', action='store_true')
|
|
37
39
|
p.add_argument('--mcp', action='store_true', help='Start MCP server mode')
|
|
40
|
+
p.add_argument(
|
|
41
|
+
'--file-type', type=str, default=None,
|
|
42
|
+
help=(
|
|
43
|
+
'File type (plaintext, latex, markdown, etc.). '
|
|
44
|
+
'Auto-detect if not provided. '
|
|
45
|
+
'File type must be provided if using stdin.'))
|
|
38
46
|
return p
|
|
39
47
|
|
|
40
48
|
|
|
41
|
-
def init(model_name, bits=None, dtype=None):
|
|
49
|
+
def init(model_name, bits=None, dtype=None, file_type=None, file_path=None, text=None, verbose=False):
|
|
42
50
|
import torch
|
|
43
|
-
from transformers import
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
51
|
+
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
|
52
|
+
from .processors import get_processor
|
|
53
|
+
|
|
54
|
+
try:
|
|
55
|
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
56
|
+
except Exception:
|
|
57
|
+
tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=True)
|
|
58
|
+
|
|
47
59
|
dtype = getattr(torch, dtype) if dtype is not None else torch.float32
|
|
48
60
|
kwargs = {}
|
|
49
61
|
if torch.cuda.is_available():
|
|
@@ -59,20 +71,29 @@ def init(model_name, bits=None, dtype=None):
|
|
|
59
71
|
if bits in [4, 8]:
|
|
60
72
|
raise RuntimeError('MPS does not support quantization.')
|
|
61
73
|
kwargs['device_map'] = 'mps'
|
|
62
|
-
|
|
63
|
-
|
|
74
|
+
|
|
75
|
+
try:
|
|
76
|
+
model = AutoModelForTokenClassification.from_pretrained(
|
|
77
|
+
model_name, torch_dtype=dtype, **kwargs)
|
|
78
|
+
except Exception:
|
|
79
|
+
model = AutoModelForTokenClassification.from_pretrained(
|
|
80
|
+
model_name, torch_dtype=dtype, local_files_only=True, **kwargs)
|
|
81
|
+
|
|
64
82
|
model.eval()
|
|
65
|
-
processor =
|
|
83
|
+
processor = get_processor(
|
|
84
|
+
file_type=file_type, file_path=file_path, text=text, verbose=verbose)
|
|
66
85
|
return tokenizer, model, processor
|
|
67
86
|
|
|
68
87
|
|
|
69
|
-
def start_server(
|
|
88
|
+
def start_server(
|
|
89
|
+
port, tokenizer, model, default_file_type=None, wrap_kwargs=None
|
|
90
|
+
):
|
|
70
91
|
from flask import Flask, request
|
|
92
|
+
from .processors import get_processor
|
|
71
93
|
app = Flask(__name__)
|
|
72
94
|
base_rv = {
|
|
73
95
|
'model': model.__class__.__name__,
|
|
74
96
|
'tokenizer': tokenizer.__class__.__name__,
|
|
75
|
-
'processor': processor.__class__.__name__,
|
|
76
97
|
}
|
|
77
98
|
|
|
78
99
|
@app.route('/check')
|
|
@@ -88,8 +109,17 @@ def start_server(port, tokenizer, model, processor, wrap_kwargs=None):
|
|
|
88
109
|
form = request.form
|
|
89
110
|
text = form['text']
|
|
90
111
|
kwargs = dict(wrap_kwargs or {})
|
|
112
|
+
|
|
113
|
+
# Get file_type from form data or use default
|
|
114
|
+
file_type = form.get('file_type', default_file_type)
|
|
115
|
+
|
|
116
|
+
# Create processor dynamically based on file type or text content
|
|
117
|
+
processor = get_processor(
|
|
118
|
+
file_type=file_type, text=text if not file_type else None)
|
|
119
|
+
|
|
120
|
+
# Process other form parameters
|
|
91
121
|
for k, v in form.items():
|
|
92
|
-
if k
|
|
122
|
+
if k in ['text', 'file_type']:
|
|
93
123
|
continue
|
|
94
124
|
if k in ['batch_size', 'tokens_per_line', 'overlap_divisor']:
|
|
95
125
|
v = int(v)
|
|
@@ -99,6 +129,8 @@ def start_server(port, tokenizer, model, processor, wrap_kwargs=None):
|
|
|
99
129
|
return {
|
|
100
130
|
'status': 'success',
|
|
101
131
|
**base_rv,
|
|
132
|
+
'processor': processor.__class__.__name__,
|
|
133
|
+
'file_type': file_type,
|
|
102
134
|
**kwargs,
|
|
103
135
|
'text': results,
|
|
104
136
|
}
|
|
@@ -106,6 +138,8 @@ def start_server(port, tokenizer, model, processor, wrap_kwargs=None):
|
|
|
106
138
|
return {
|
|
107
139
|
'status': 'error',
|
|
108
140
|
**base_rv,
|
|
141
|
+
'processor': processor.__class__.__name__,
|
|
142
|
+
'file_type': file_type,
|
|
109
143
|
**kwargs,
|
|
110
144
|
'error': str(e),
|
|
111
145
|
'traceback': traceback.format_exc(),
|
|
@@ -180,9 +214,9 @@ def main() -> int:
|
|
|
180
214
|
return 0
|
|
181
215
|
kwargs = wrap_kwargs(args)
|
|
182
216
|
if args.listen:
|
|
183
|
-
tokenizer, model,
|
|
184
|
-
args.model_name, args.bits, args.dtype)
|
|
185
|
-
start_server(args.port, tokenizer, model,
|
|
217
|
+
tokenizer, model, _ = init(
|
|
218
|
+
args.model_name, args.bits, args.dtype, args.file_type, None, None, args.verbose)
|
|
219
|
+
start_server(args.port, tokenizer, model, args.file_type, kwargs)
|
|
186
220
|
return 0
|
|
187
221
|
if args.input_file is not None:
|
|
188
222
|
with open(args.input_file, 'r', encoding='utf-8') as f:
|
|
@@ -198,7 +232,8 @@ def main() -> int:
|
|
|
198
232
|
else:
|
|
199
233
|
from .inference import sembr
|
|
200
234
|
tokenizer, model, processor = init(
|
|
201
|
-
args.model_name, args.bits, args.dtype
|
|
235
|
+
args.model_name, args.bits, args.dtype,
|
|
236
|
+
args.file_type, args.input_file, text, args.verbose)
|
|
202
237
|
result = sembr(text, tokenizer, model, processor, **kwargs)
|
|
203
238
|
if args.output_file is None:
|
|
204
239
|
print(result)
|
|
@@ -108,7 +108,7 @@ def inference(
|
|
|
108
108
|
if text.strip() == '':
|
|
109
109
|
return []
|
|
110
110
|
collator = DataCollatorForTokenClassification(tokenizer, padding='longest')
|
|
111
|
-
results = processor(text, split=isinstance(text, str))
|
|
111
|
+
results = processor.parse_text(text, split=isinstance(text, str))
|
|
112
112
|
results = processor.tokenize_with_modes(tokenizer, results)
|
|
113
113
|
logits, counts = _tiled_inference(
|
|
114
114
|
model, collator, results, batch_size, overlap_divisor)
|
|
@@ -9,16 +9,25 @@ from .cli import init, cli_parser, wrap_kwargs
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class SembrModel:
|
|
12
|
-
def __init__(self, tokenizer, model,
|
|
12
|
+
def __init__(self, tokenizer, model, default_file_type=None, kwargs=None):
|
|
13
13
|
self.tokenizer = tokenizer
|
|
14
14
|
self.model = model
|
|
15
|
-
self.
|
|
16
|
-
self.kwargs = kwargs
|
|
15
|
+
self.default_file_type = default_file_type
|
|
16
|
+
self.kwargs = kwargs or {}
|
|
17
17
|
|
|
18
|
-
def process_text(self, text: str) -> str:
|
|
18
|
+
def process_text(self, text: str, file_type: Optional[str] = None) -> str:
|
|
19
19
|
from .inference import sembr
|
|
20
|
+
from .processors import get_processor
|
|
21
|
+
|
|
22
|
+
# Use provided file_type, default, or auto-detect from text
|
|
23
|
+
effective_file_type = file_type or self.default_file_type
|
|
24
|
+
processor = get_processor(
|
|
25
|
+
file_type=effective_file_type,
|
|
26
|
+
text=text if not effective_file_type else None
|
|
27
|
+
)
|
|
28
|
+
|
|
20
29
|
return sembr(
|
|
21
|
-
text, self.tokenizer, self.model,
|
|
30
|
+
text, self.tokenizer, self.model, processor, **self.kwargs)
|
|
22
31
|
|
|
23
32
|
|
|
24
33
|
_sembr_model: Optional[SembrModel] = None
|
|
@@ -31,9 +40,10 @@ def get_sembr_model() -> SembrModel:
|
|
|
31
40
|
return _sembr_model
|
|
32
41
|
parser = cli_parser()
|
|
33
42
|
args, _ = parser.parse_known_args()
|
|
34
|
-
tokenizer, model,
|
|
43
|
+
tokenizer, model, _ = init(
|
|
44
|
+
args.model_name, args.bits, args.dtype, args.file_type)
|
|
35
45
|
kwargs = wrap_kwargs(args)
|
|
36
|
-
_sembr_model = SembrModel(tokenizer, model,
|
|
46
|
+
_sembr_model = SembrModel(tokenizer, model, args.file_type, kwargs)
|
|
37
47
|
return _sembr_model
|
|
38
48
|
|
|
39
49
|
|
|
@@ -46,18 +56,26 @@ mcp = FastMCP("SemBr")
|
|
|
46
56
|
)
|
|
47
57
|
def wrap_text(
|
|
48
58
|
text: Annotated[str, Field(description="Text to wrap")],
|
|
59
|
+
file_type: Annotated[Optional[str], Field(
|
|
60
|
+
description=(
|
|
61
|
+
"File type (latex, markdown, plaintext, etc.). "
|
|
62
|
+
"Auto-detect if not provided."),
|
|
63
|
+
default=None
|
|
64
|
+
)] = None,
|
|
49
65
|
) -> ToolResult:
|
|
50
66
|
try:
|
|
51
|
-
wrapped_text = get_sembr_model().process_text(text)
|
|
67
|
+
wrapped_text = get_sembr_model().process_text(text, file_type)
|
|
52
68
|
except Exception as e:
|
|
53
69
|
return ToolResult(
|
|
54
70
|
content=[TextContent(type="text", text=f"Error processing text: {str(e)}")],
|
|
55
71
|
structured_content={"success": False, "error": str(e)})
|
|
56
72
|
num_lines = len(wrapped_text.splitlines())
|
|
57
73
|
readable = f"Performed semantic line breaks to {num_lines} lines."
|
|
74
|
+
if file_type:
|
|
75
|
+
readable += f" File type: {file_type}."
|
|
58
76
|
return ToolResult(
|
|
59
77
|
content=[TextContent(type="text", text=readable)],
|
|
60
|
-
structured_content={"success": True, "output": wrapped_text})
|
|
78
|
+
structured_content={"success": True, "output": wrapped_text, "file_type": file_type})
|
|
61
79
|
|
|
62
80
|
|
|
63
81
|
if __name__ == "__main__":
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Grammar-based text processors for different file types.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .base import BaseProcessor
|
|
6
|
+
from .latex import LaTeXProcessor
|
|
7
|
+
from .markdown import MarkdownProcessor
|
|
8
|
+
from .plaintext import PlainTextProcessor
|
|
9
|
+
from .utils import get_processor, detect_file_type_from_text
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
'BaseProcessor',
|
|
14
|
+
'LaTeXProcessor', 'MarkdownProcessor', 'PlainTextProcessor',
|
|
15
|
+
'get_processor', 'detect_file_type_from_text']
|