sembr 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sembr
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: A semantic linebreaker powered by transformers
5
5
  Author: admk
6
6
  License-Expression: MIT
@@ -14,14 +14,15 @@ Requires-Python: >=3.10
14
14
  Description-Content-Type: text/markdown
15
15
  License-File: LICENSE
16
16
  Requires-Dist: accelerate
17
- Requires-Dist: transformers
18
- Requires-Dist: torch
17
+ Requires-Dist: fastmcp
18
+ Requires-Dist: flask
19
+ Requires-Dist: mcp[cli]
19
20
  Requires-Dist: numpy
20
- Requires-Dist: tqdm
21
+ Requires-Dist: pydantic
21
22
  Requires-Dist: requests
22
- Requires-Dist: flask
23
- Requires-Dist: mcp[cli]>=1.2.0
24
- Requires-Dist: fastmcp>=2.10.6
23
+ Requires-Dist: torch
24
+ Requires-Dist: tqdm
25
+ Requires-Dist: transformers
25
26
  Dynamic: license-file
26
27
 
27
28
  # Semantic Line Breaker (SemBr)
@@ -15,14 +15,15 @@ license-files = ["LICEN[CS]E*"]
15
15
  requires-python = ">=3.10"
16
16
  dependencies = [
17
17
  "accelerate",
18
- "transformers",
19
- "torch",
18
+ "fastmcp",
19
+ "flask",
20
+ "mcp[cli]",
20
21
  "numpy",
21
- "tqdm",
22
+ "pydantic",
22
23
  "requests",
23
- "flask",
24
- "mcp[cli]>=1.2.0",
25
- "fastmcp>=2.10.6",
24
+ "torch",
25
+ "tqdm",
26
+ "transformers",
26
27
  ]
27
28
  classifiers = [
28
29
  "Programming Language :: Python :: 3",
@@ -1,5 +1,5 @@
1
1
  __toolname__ = __name__
2
- __version__ = "0.2.2"
2
+ __version__ = "0.2.3"
3
3
  __author__ = "admk"
4
4
  __license__ = "MIT"
5
5
  __url__ = f"https://github.com/admk/{__name__}"
@@ -108,7 +108,7 @@ def inference(
108
108
  if text.strip() == '':
109
109
  return []
110
110
  collator = DataCollatorForTokenClassification(tokenizer, padding='longest')
111
- results = processor(text, split=isinstance(text, str))
111
+ results = processor.parse_text(text, split=isinstance(text, str))
112
112
  results = processor.tokenize_with_modes(tokenizer, results)
113
113
  logits, counts = _tiled_inference(
114
114
  model, collator, results, batch_size, overlap_divisor)
@@ -143,6 +143,20 @@ class SemBrProcessor(object):
143
143
  'base_indent': base_indent,
144
144
  }
145
145
 
146
+ def parse_text(self, text, split=True):
147
+ text = text.replace('\t', ' ' * self.spaces)
148
+ if split:
149
+ text = re.split(r'\n(?:\s*\n)+', text)
150
+ elif isinstance(text, str):
151
+ raise ValueError(
152
+ 'Text must be a list of strings if split=True.')
153
+ paras = []
154
+ for p in text:
155
+ if not p.strip():
156
+ continue
157
+ paras.append(self._process_paragraph(p))
158
+ return paras
159
+
146
160
  def _tokenize_with_modes(
147
161
  self, tokenizer, text, line_modes, line_mode_offsets, line_indents
148
162
  ):
@@ -201,19 +215,6 @@ class SemBrProcessor(object):
201
215
  new_results.append(tokenized)
202
216
  return new_results
203
217
 
204
- def __call__(self, text, split=True):
205
- if split:
206
- text = re.split(r'\n(?:\s*\n)+', text)
207
- elif isinstance(text, str):
208
- raise ValueError(
209
- 'Text must be a list of strings if split=True.')
210
- paras = []
211
- for p in text:
212
- if not p.strip():
213
- continue
214
- paras.append(self._process_paragraph(p))
215
- return paras
216
-
217
218
  def _replace_newlines(self, words, modes, indents):
218
219
  new_words, new_modes, new_indents = [], [], []
219
220
  next_mode = None
@@ -288,11 +289,10 @@ class SemBrProcessor(object):
288
289
 
289
290
 
290
291
  if __name__ == '__main__':
291
- # test = open('./data/test/mair.tex', 'r').read()
292
- test = open('./data/example.tex', 'r').read()
292
+ test = open('./data/raw/example.tex', 'r').read()
293
293
  processor = SemBrProcessor()
294
294
  tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
295
- results = processor(test)
295
+ results = processor.parse_text(test)
296
296
  results = processor.tokenize_with_modes(tokenizer, results)
297
297
  print('--- Processed ---')
298
298
  print(processor.generate(results))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sembr
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: A semantic linebreaker powered by transformers
5
5
  Author: admk
6
6
  License-Expression: MIT
@@ -14,14 +14,15 @@ Requires-Python: >=3.10
14
14
  Description-Content-Type: text/markdown
15
15
  License-File: LICENSE
16
16
  Requires-Dist: accelerate
17
- Requires-Dist: transformers
18
- Requires-Dist: torch
17
+ Requires-Dist: fastmcp
18
+ Requires-Dist: flask
19
+ Requires-Dist: mcp[cli]
19
20
  Requires-Dist: numpy
20
- Requires-Dist: tqdm
21
+ Requires-Dist: pydantic
21
22
  Requires-Dist: requests
22
- Requires-Dist: flask
23
- Requires-Dist: mcp[cli]>=1.2.0
24
- Requires-Dist: fastmcp>=2.10.6
23
+ Requires-Dist: torch
24
+ Requires-Dist: tqdm
25
+ Requires-Dist: transformers
25
26
  Dynamic: license-file
26
27
 
27
28
  # Semantic Line Breaker (SemBr)
@@ -1,9 +1,10 @@
1
1
  accelerate
2
- transformers
3
- torch
2
+ fastmcp
3
+ flask
4
+ mcp[cli]
4
5
  numpy
5
- tqdm
6
+ pydantic
6
7
  requests
7
- flask
8
- mcp[cli]>=1.2.0
9
- fastmcp>=2.10.6
8
+ torch
9
+ tqdm
10
+ transformers
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes