sembr 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {sembr-0.2.2 → sembr-0.2.4}/PKG-INFO +48 -9
  2. sembr-0.2.2/sembr.egg-info/PKG-INFO → sembr-0.2.4/README.md +37 -28
  3. {sembr-0.2.2 → sembr-0.2.4}/pyproject.toml +11 -7
  4. {sembr-0.2.2 → sembr-0.2.4}/sembr/__init__.py +1 -1
  5. {sembr-0.2.2 → sembr-0.2.4}/sembr/cli.py +51 -16
  6. {sembr-0.2.2 → sembr-0.2.4}/sembr/inference.py +1 -1
  7. {sembr-0.2.2 → sembr-0.2.4}/sembr/mcp.py +27 -9
  8. sembr-0.2.4/sembr/processors/__init__.py +15 -0
  9. sembr-0.2.4/sembr/processors/base.py +294 -0
  10. sembr-0.2.4/sembr/processors/latex.py +384 -0
  11. sembr-0.2.4/sembr/processors/markdown.py +298 -0
  12. sembr-0.2.4/sembr/processors/plaintext.py +241 -0
  13. sembr-0.2.4/sembr/processors/utils.py +70 -0
  14. sembr-0.2.2/README.md → sembr-0.2.4/sembr.egg-info/PKG-INFO +67 -2
  15. {sembr-0.2.2 → sembr-0.2.4}/sembr.egg-info/SOURCES.txt +7 -2
  16. sembr-0.2.4/sembr.egg-info/requires.txt +13 -0
  17. sembr-0.2.2/sembr/process.py +0 -302
  18. sembr-0.2.2/sembr.egg-info/requires.txt +0 -9
  19. {sembr-0.2.2 → sembr-0.2.4}/LICENSE +0 -0
  20. {sembr-0.2.2 → sembr-0.2.4}/sembr/databuilder.py +0 -0
  21. {sembr-0.2.2 → sembr-0.2.4}/sembr/dataset.py +0 -0
  22. {sembr-0.2.2 → sembr-0.2.4}/sembr/eval.py +0 -0
  23. {sembr-0.2.2 → sembr-0.2.4}/sembr/train.py +0 -0
  24. {sembr-0.2.2 → sembr-0.2.4}/sembr/utils.py +0 -0
  25. {sembr-0.2.2 → sembr-0.2.4}/sembr.egg-info/dependency_links.txt +0 -0
  26. {sembr-0.2.2 → sembr-0.2.4}/sembr.egg-info/entry_points.txt +0 -0
  27. {sembr-0.2.2 → sembr-0.2.4}/sembr.egg-info/top_level.txt +0 -0
  28. {sembr-0.2.2 → sembr-0.2.4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sembr
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: A semantic linebreaker powered by transformers
5
5
  Author: admk
6
6
  License-Expression: MIT
@@ -14,17 +14,21 @@ Requires-Python: >=3.10
14
14
  Description-Content-Type: text/markdown
15
15
  License-File: LICENSE
16
16
  Requires-Dist: accelerate
17
- Requires-Dist: transformers
18
- Requires-Dist: torch
17
+ Requires-Dist: fastmcp
18
+ Requires-Dist: flask
19
+ Requires-Dist: magika
20
+ Requires-Dist: mcp[cli]
19
21
  Requires-Dist: numpy
20
- Requires-Dist: tqdm
22
+ Requires-Dist: pydantic
21
23
  Requires-Dist: requests
22
- Requires-Dist: flask
23
- Requires-Dist: mcp[cli]>=1.2.0
24
- Requires-Dist: fastmcp>=2.10.6
24
+ Requires-Dist: torch
25
+ Requires-Dist: tqdm
26
+ Requires-Dist: transformers
27
+ Requires-Dist: tree-sitter>=0.25.0
28
+ Requires-Dist: tree-sitter-markdown>=0.3.2
25
29
  Dynamic: license-file
26
30
 
27
- # Semantic Line Breaker (SemBr)
31
+ # ⚡️ Semantic Line Breaker (SemBr)
28
32
 
29
33
  [![GitHub](https://img.shields.io/github/license/admk/sembr)](LICENSE)
30
34
  [![python](https://img.shields.io/badge/Python-3.10-3776AB.svg?style=flat&logo=python&logoColor=white)](https://www.python.org)
@@ -45,6 +49,9 @@ SemBr is a command-line tool
45
49
  powered by [Transformer][transformers1] [models][transformers2]
46
50
  that performs [semantic linebreaks](#what-are-semantic-line-breaks)
47
51
  to break lines in a text file at semantic boundaries.
52
+ It supports multiple file types
53
+ including LaTeX, Markdown, and plain text,
54
+ with automatic file type detection.
48
55
 
49
56
  ### Installation
50
57
 
@@ -66,6 +73,34 @@ sembr # run
66
73
  uvx sembr # install and run directly
67
74
  ```
68
75
 
76
+ #### From GitHub (Latest Development Version)
77
+
78
+ To install the latest development version directly from GitHub:
79
+
80
+ ```shell
81
+ # Install from GitHub main branch
82
+ uv tool install git+https://github.com/admk/sembr.git
83
+
84
+ # Run directly without installing
85
+ uvx --from git+https://github.com/admk/sembr.git sembr
86
+ ```
87
+
88
+ Alternatively, clone and install in development mode:
89
+
90
+ ```shell
91
+ # Clone the repository
92
+ git clone https://github.com/admk/sembr.git
93
+ cd sembr
94
+
95
+ # Install in development mode
96
+ pip install -e .
97
+
98
+ # Or with uv
99
+ uv pip install -e .
100
+ ```
101
+
102
+ Note that the development version may include experimental features and could be less stable than the PyPI release.
103
+
69
104
  ### Supported Platforms
70
105
 
71
106
  SemBr is supported on Linux, Mac and Windows.
@@ -155,6 +190,9 @@ to customize the behavior of SemBr:
155
190
  * `--dtype <dtype>`:
156
191
  Data type for model weights (e.g. `float16`, `bfloat16`).
157
192
  Default is `float32`.
193
+ * `--file-type <type>`:
194
+ File type (`plaintext`, `latex`, `markdown`, etc.).
195
+ Auto-detected using [Magika][magika] if not provided.
158
196
  * `--mcp`:
159
197
  Start MCP server mode instead of processing text.
160
198
 
@@ -357,7 +395,7 @@ to save best models.
357
395
  - Natural language support:
358
396
  - [ ] Support natural languages other than English.
359
397
  - Typesetting languages support:
360
- - [ ] Markdown.
398
+ - [x] ~~Markdown.~~
361
399
  - [ ] Typst.
362
400
  - Usability:
363
401
  - [ ] Inference queue.
@@ -405,6 +443,7 @@ Semantic line breaking:
405
443
  [pypi]: https://pypi.org/project/sembr
406
444
  [uv]: https://github.com/astral-sh/uv
407
445
  [mcp]: https://modelcontextprotocol.io/overview
446
+ [magika]: https://github.com/google/magika
408
447
 
409
448
  [sembr]: https://sembr.org
410
449
  [semlf]: https://rhodesmill.org/brandon/2012/one-sentence-per-line
@@ -1,30 +1,4 @@
1
- Metadata-Version: 2.4
2
- Name: sembr
3
- Version: 0.2.2
4
- Summary: A semantic linebreaker powered by transformers
5
- Author: admk
6
- License-Expression: MIT
7
- Project-URL: Homepage, https://github.com/admk/sembr
8
- Project-URL: Issues, https://github.com/admk/sembr/issues
9
- Classifier: Programming Language :: Python :: 3
10
- Classifier: Operating System :: OS Independent
11
- Classifier: Topic :: Utilities
12
- Classifier: Environment :: Console
13
- Requires-Python: >=3.10
14
- Description-Content-Type: text/markdown
15
- License-File: LICENSE
16
- Requires-Dist: accelerate
17
- Requires-Dist: transformers
18
- Requires-Dist: torch
19
- Requires-Dist: numpy
20
- Requires-Dist: tqdm
21
- Requires-Dist: requests
22
- Requires-Dist: flask
23
- Requires-Dist: mcp[cli]>=1.2.0
24
- Requires-Dist: fastmcp>=2.10.6
25
- Dynamic: license-file
26
-
27
- # Semantic Line Breaker (SemBr)
1
+ # ⚡️ Semantic Line Breaker (SemBr)
28
2
 
29
3
  [![GitHub](https://img.shields.io/github/license/admk/sembr)](LICENSE)
30
4
  [![python](https://img.shields.io/badge/Python-3.10-3776AB.svg?style=flat&logo=python&logoColor=white)](https://www.python.org)
@@ -45,6 +19,9 @@ SemBr is a command-line tool
45
19
  powered by [Transformer][transformers1] [models][transformers2]
46
20
  that performs [semantic linebreaks](#what-are-semantic-line-breaks)
47
21
  to break lines in a text file at semantic boundaries.
22
+ It supports multiple file types
23
+ including LaTeX, Markdown, and plain text,
24
+ with automatic file type detection.
48
25
 
49
26
  ### Installation
50
27
 
@@ -66,6 +43,34 @@ sembr # run
66
43
  uvx sembr # install and run directly
67
44
  ```
68
45
 
46
+ #### From GitHub (Latest Development Version)
47
+
48
+ To install the latest development version directly from GitHub:
49
+
50
+ ```shell
51
+ # Install from GitHub main branch
52
+ uv tool install git+https://github.com/admk/sembr.git
53
+
54
+ # Run directly without installing
55
+ uvx --from git+https://github.com/admk/sembr.git sembr
56
+ ```
57
+
58
+ Alternatively, clone and install in development mode:
59
+
60
+ ```shell
61
+ # Clone the repository
62
+ git clone https://github.com/admk/sembr.git
63
+ cd sembr
64
+
65
+ # Install in development mode
66
+ pip install -e .
67
+
68
+ # Or with uv
69
+ uv pip install -e .
70
+ ```
71
+
72
+ Note that the development version may include experimental features and could be less stable than the PyPI release.
73
+
69
74
  ### Supported Platforms
70
75
 
71
76
  SemBr is supported on Linux, Mac and Windows.
@@ -155,6 +160,9 @@ to customize the behavior of SemBr:
155
160
  * `--dtype <dtype>`:
156
161
  Data type for model weights (e.g. `float16`, `bfloat16`).
157
162
  Default is `float32`.
163
+ * `--file-type <type>`:
164
+ File type (`plaintext`, `latex`, `markdown`, etc.).
165
+ Auto-detected using [Magika][magika] if not provided.
158
166
  * `--mcp`:
159
167
  Start MCP server mode instead of processing text.
160
168
 
@@ -357,7 +365,7 @@ to save best models.
357
365
  - Natural language support:
358
366
  - [ ] Support natural languages other than English.
359
367
  - Typesetting languages support:
360
- - [ ] Markdown.
368
+ - [x] ~~Markdown.~~
361
369
  - [ ] Typst.
362
370
  - Usability:
363
371
  - [ ] Inference queue.
@@ -405,6 +413,7 @@ Semantic line breaking:
405
413
  [pypi]: https://pypi.org/project/sembr
406
414
  [uv]: https://github.com/astral-sh/uv
407
415
  [mcp]: https://modelcontextprotocol.io/overview
416
+ [magika]: https://github.com/google/magika
408
417
 
409
418
  [sembr]: https://sembr.org
410
419
  [semlf]: https://rhodesmill.org/brandon/2012/one-sentence-per-line
@@ -15,14 +15,18 @@ license-files = ["LICEN[CS]E*"]
15
15
  requires-python = ">=3.10"
16
16
  dependencies = [
17
17
  "accelerate",
18
- "transformers",
19
- "torch",
18
+ "fastmcp",
19
+ "flask",
20
+ "magika",
21
+ "mcp[cli]",
20
22
  "numpy",
21
- "tqdm",
23
+ "pydantic",
22
24
  "requests",
23
- "flask",
24
- "mcp[cli]>=1.2.0",
25
- "fastmcp>=2.10.6",
25
+ "torch",
26
+ "tqdm",
27
+ "transformers",
28
+ "tree-sitter>=0.25.0",
29
+ "tree-sitter-markdown>=0.3.2",
26
30
  ]
27
31
  classifiers = [
28
32
  "Programming Language :: Python :: 3",
@@ -39,7 +43,7 @@ Homepage = "https://github.com/admk/sembr"
39
43
  Issues = "https://github.com/admk/sembr/issues"
40
44
 
41
45
  [tool.setuptools.packages.find]
42
- include = ["sembr"]
46
+ include = ["sembr", "sembr.*"]
43
47
  exclude = ["data*", "tests*"]
44
48
 
45
49
  [tool.setuptools.dynamic]
@@ -1,5 +1,5 @@
1
1
  __toolname__ = __name__
2
- __version__ = "0.2.2"
2
+ __version__ = "0.2.4"
3
3
  __author__ = "admk"
4
4
  __license__ = "MIT"
5
5
  __url__ = f"https://github.com/admk/{__name__}"
@@ -18,7 +18,9 @@ def cli_parser():
18
18
  p = argparse.ArgumentParser(
19
19
  description='SemBr: Rewrap text with semantic breaks.')
20
20
  model_name = 'admko/sembr2023-bert-small'
21
- p.add_argument('-v', '--version', action='version', version=__version__)
21
+ p.add_argument('-V', '--version', action='version', version=__version__)
22
+ p.add_argument(
23
+ '-v', '--verbose', action='store_true', help='Enable verbose output')
22
24
  p.add_argument('-m', '--model-name', type=str, default=model_name)
23
25
  p.add_argument('-i', '--input-file', type=str, default=None)
24
26
  p.add_argument('-o', '--output-file', type=str, default=None)
@@ -35,15 +37,25 @@ def cli_parser():
35
37
  p.add_argument('--dtype', type=str, default=None)
36
38
  p.add_argument('--debug', action='store_true')
37
39
  p.add_argument('--mcp', action='store_true', help='Start MCP server mode')
40
+ p.add_argument(
41
+ '--file-type', type=str, default=None,
42
+ help=(
43
+ 'File type (plaintext, latex, markdown, etc.). '
44
+ 'Auto-detect if not provided. '
45
+ 'File type must be provided if using stdin.'))
38
46
  return p
39
47
 
40
48
 
41
- def init(model_name, bits=None, dtype=None):
49
+ def init(model_name, bits=None, dtype=None, file_type=None, file_path=None, text=None, verbose=False):
42
50
  import torch
43
- from transformers import (
44
- AutoTokenizer, AutoModelForTokenClassification)
45
- from .process import SemBrProcessor
46
- tokenizer = AutoTokenizer.from_pretrained(model_name)
51
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
52
+ from .processors import get_processor
53
+
54
+ try:
55
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
56
+ except Exception:
57
+ tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=True)
58
+
47
59
  dtype = getattr(torch, dtype) if dtype is not None else torch.float32
48
60
  kwargs = {}
49
61
  if torch.cuda.is_available():
@@ -59,20 +71,29 @@ def init(model_name, bits=None, dtype=None):
59
71
  if bits in [4, 8]:
60
72
  raise RuntimeError('MPS does not support quantization.')
61
73
  kwargs['device_map'] = 'mps'
62
- model = AutoModelForTokenClassification.from_pretrained(
63
- model_name, torch_dtype=dtype, **kwargs)
74
+
75
+ try:
76
+ model = AutoModelForTokenClassification.from_pretrained(
77
+ model_name, torch_dtype=dtype, **kwargs)
78
+ except Exception:
79
+ model = AutoModelForTokenClassification.from_pretrained(
80
+ model_name, torch_dtype=dtype, local_files_only=True, **kwargs)
81
+
64
82
  model.eval()
65
- processor = SemBrProcessor()
83
+ processor = get_processor(
84
+ file_type=file_type, file_path=file_path, text=text, verbose=verbose)
66
85
  return tokenizer, model, processor
67
86
 
68
87
 
69
- def start_server(port, tokenizer, model, processor, wrap_kwargs=None):
88
+ def start_server(
89
+ port, tokenizer, model, default_file_type=None, wrap_kwargs=None
90
+ ):
70
91
  from flask import Flask, request
92
+ from .processors import get_processor
71
93
  app = Flask(__name__)
72
94
  base_rv = {
73
95
  'model': model.__class__.__name__,
74
96
  'tokenizer': tokenizer.__class__.__name__,
75
- 'processor': processor.__class__.__name__,
76
97
  }
77
98
 
78
99
  @app.route('/check')
@@ -88,8 +109,17 @@ def start_server(port, tokenizer, model, processor, wrap_kwargs=None):
88
109
  form = request.form
89
110
  text = form['text']
90
111
  kwargs = dict(wrap_kwargs or {})
112
+
113
+ # Get file_type from form data or use default
114
+ file_type = form.get('file_type', default_file_type)
115
+
116
+ # Create processor dynamically based on file type or text content
117
+ processor = get_processor(
118
+ file_type=file_type, text=text if not file_type else None)
119
+
120
+ # Process other form parameters
91
121
  for k, v in form.items():
92
- if k == 'text':
122
+ if k in ['text', 'file_type']:
93
123
  continue
94
124
  if k in ['batch_size', 'tokens_per_line', 'overlap_divisor']:
95
125
  v = int(v)
@@ -99,6 +129,8 @@ def start_server(port, tokenizer, model, processor, wrap_kwargs=None):
99
129
  return {
100
130
  'status': 'success',
101
131
  **base_rv,
132
+ 'processor': processor.__class__.__name__,
133
+ 'file_type': file_type,
102
134
  **kwargs,
103
135
  'text': results,
104
136
  }
@@ -106,6 +138,8 @@ def start_server(port, tokenizer, model, processor, wrap_kwargs=None):
106
138
  return {
107
139
  'status': 'error',
108
140
  **base_rv,
141
+ 'processor': processor.__class__.__name__,
142
+ 'file_type': file_type,
109
143
  **kwargs,
110
144
  'error': str(e),
111
145
  'traceback': traceback.format_exc(),
@@ -180,9 +214,9 @@ def main() -> int:
180
214
  return 0
181
215
  kwargs = wrap_kwargs(args)
182
216
  if args.listen:
183
- tokenizer, model, processor = init(
184
- args.model_name, args.bits, args.dtype)
185
- start_server(args.port, tokenizer, model, processor, kwargs)
217
+ tokenizer, model, _ = init(
218
+ args.model_name, args.bits, args.dtype, args.file_type, None, None, args.verbose)
219
+ start_server(args.port, tokenizer, model, args.file_type, kwargs)
186
220
  return 0
187
221
  if args.input_file is not None:
188
222
  with open(args.input_file, 'r', encoding='utf-8') as f:
@@ -198,7 +232,8 @@ def main() -> int:
198
232
  else:
199
233
  from .inference import sembr
200
234
  tokenizer, model, processor = init(
201
- args.model_name, args.bits, args.dtype)
235
+ args.model_name, args.bits, args.dtype,
236
+ args.file_type, args.input_file, text, args.verbose)
202
237
  result = sembr(text, tokenizer, model, processor, **kwargs)
203
238
  if args.output_file is None:
204
239
  print(result)
@@ -108,7 +108,7 @@ def inference(
108
108
  if text.strip() == '':
109
109
  return []
110
110
  collator = DataCollatorForTokenClassification(tokenizer, padding='longest')
111
- results = processor(text, split=isinstance(text, str))
111
+ results = processor.parse_text(text, split=isinstance(text, str))
112
112
  results = processor.tokenize_with_modes(tokenizer, results)
113
113
  logits, counts = _tiled_inference(
114
114
  model, collator, results, batch_size, overlap_divisor)
@@ -9,16 +9,25 @@ from .cli import init, cli_parser, wrap_kwargs
9
9
 
10
10
 
11
11
  class SembrModel:
12
- def __init__(self, tokenizer, model, processor, kwargs):
12
+ def __init__(self, tokenizer, model, default_file_type=None, kwargs=None):
13
13
  self.tokenizer = tokenizer
14
14
  self.model = model
15
- self.processor = processor
16
- self.kwargs = kwargs
15
+ self.default_file_type = default_file_type
16
+ self.kwargs = kwargs or {}
17
17
 
18
- def process_text(self, text: str) -> str:
18
+ def process_text(self, text: str, file_type: Optional[str] = None) -> str:
19
19
  from .inference import sembr
20
+ from .processors import get_processor
21
+
22
+ # Use provided file_type, default, or auto-detect from text
23
+ effective_file_type = file_type or self.default_file_type
24
+ processor = get_processor(
25
+ file_type=effective_file_type,
26
+ text=text if not effective_file_type else None
27
+ )
28
+
20
29
  return sembr(
21
- text, self.tokenizer, self.model, self.processor, **self.kwargs)
30
+ text, self.tokenizer, self.model, processor, **self.kwargs)
22
31
 
23
32
 
24
33
  _sembr_model: Optional[SembrModel] = None
@@ -31,9 +40,10 @@ def get_sembr_model() -> SembrModel:
31
40
  return _sembr_model
32
41
  parser = cli_parser()
33
42
  args, _ = parser.parse_known_args()
34
- tokenizer, model, processor = init(args.model_name, args.bits, args.dtype)
43
+ tokenizer, model, _ = init(
44
+ args.model_name, args.bits, args.dtype, args.file_type)
35
45
  kwargs = wrap_kwargs(args)
36
- _sembr_model = SembrModel(tokenizer, model, processor, kwargs)
46
+ _sembr_model = SembrModel(tokenizer, model, args.file_type, kwargs)
37
47
  return _sembr_model
38
48
 
39
49
 
@@ -46,18 +56,26 @@ mcp = FastMCP("SemBr")
46
56
  )
47
57
  def wrap_text(
48
58
  text: Annotated[str, Field(description="Text to wrap")],
59
+ file_type: Annotated[Optional[str], Field(
60
+ description=(
61
+ "File type (latex, markdown, plaintext, etc.). "
62
+ "Auto-detect if not provided."),
63
+ default=None
64
+ )] = None,
49
65
  ) -> ToolResult:
50
66
  try:
51
- wrapped_text = get_sembr_model().process_text(text)
67
+ wrapped_text = get_sembr_model().process_text(text, file_type)
52
68
  except Exception as e:
53
69
  return ToolResult(
54
70
  content=[TextContent(type="text", text=f"Error processing text: {str(e)}")],
55
71
  structured_content={"success": False, "error": str(e)})
56
72
  num_lines = len(wrapped_text.splitlines())
57
73
  readable = f"Performed semantic line breaks to {num_lines} lines."
74
+ if file_type:
75
+ readable += f" File type: {file_type}."
58
76
  return ToolResult(
59
77
  content=[TextContent(type="text", text=readable)],
60
- structured_content={"success": True, "output": wrapped_text})
78
+ structured_content={"success": True, "output": wrapped_text, "file_type": file_type})
61
79
 
62
80
 
63
81
  if __name__ == "__main__":
@@ -0,0 +1,15 @@
1
+ """
2
+ Grammar-based text processors for different file types.
3
+ """
4
+
5
+ from .base import BaseProcessor
6
+ from .latex import LaTeXProcessor
7
+ from .markdown import MarkdownProcessor
8
+ from .plaintext import PlainTextProcessor
9
+ from .utils import get_processor, detect_file_type_from_text
10
+
11
+
12
+ __all__ = [
13
+ 'BaseProcessor',
14
+ 'LaTeXProcessor', 'MarkdownProcessor', 'PlainTextProcessor',
15
+ 'get_processor', 'detect_file_type_from_text']