sembr 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sembr-0.2.2/sembr.egg-info → sembr-0.2.3}/PKG-INFO +8 -7
- {sembr-0.2.2 → sembr-0.2.3}/pyproject.toml +7 -6
- {sembr-0.2.2 → sembr-0.2.3}/sembr/__init__.py +1 -1
- {sembr-0.2.2 → sembr-0.2.3}/sembr/inference.py +1 -1
- {sembr-0.2.2 → sembr-0.2.3}/sembr/process.py +16 -16
- {sembr-0.2.2 → sembr-0.2.3/sembr.egg-info}/PKG-INFO +8 -7
- {sembr-0.2.2 → sembr-0.2.3}/sembr.egg-info/requires.txt +7 -6
- {sembr-0.2.2 → sembr-0.2.3}/LICENSE +0 -0
- {sembr-0.2.2 → sembr-0.2.3}/README.md +0 -0
- {sembr-0.2.2 → sembr-0.2.3}/sembr/cli.py +0 -0
- {sembr-0.2.2 → sembr-0.2.3}/sembr/databuilder.py +0 -0
- {sembr-0.2.2 → sembr-0.2.3}/sembr/dataset.py +0 -0
- {sembr-0.2.2 → sembr-0.2.3}/sembr/eval.py +0 -0
- {sembr-0.2.2 → sembr-0.2.3}/sembr/mcp.py +0 -0
- {sembr-0.2.2 → sembr-0.2.3}/sembr/train.py +0 -0
- {sembr-0.2.2 → sembr-0.2.3}/sembr/utils.py +0 -0
- {sembr-0.2.2 → sembr-0.2.3}/sembr.egg-info/SOURCES.txt +0 -0
- {sembr-0.2.2 → sembr-0.2.3}/sembr.egg-info/dependency_links.txt +0 -0
- {sembr-0.2.2 → sembr-0.2.3}/sembr.egg-info/entry_points.txt +0 -0
- {sembr-0.2.2 → sembr-0.2.3}/sembr.egg-info/top_level.txt +0 -0
- {sembr-0.2.2 → sembr-0.2.3}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sembr
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: A semantic linebreaker powered by transformers
|
|
5
5
|
Author: admk
|
|
6
6
|
License-Expression: MIT
|
|
@@ -14,14 +14,15 @@ Requires-Python: >=3.10
|
|
|
14
14
|
Description-Content-Type: text/markdown
|
|
15
15
|
License-File: LICENSE
|
|
16
16
|
Requires-Dist: accelerate
|
|
17
|
-
Requires-Dist:
|
|
18
|
-
Requires-Dist:
|
|
17
|
+
Requires-Dist: fastmcp
|
|
18
|
+
Requires-Dist: flask
|
|
19
|
+
Requires-Dist: mcp[cli]
|
|
19
20
|
Requires-Dist: numpy
|
|
20
|
-
Requires-Dist:
|
|
21
|
+
Requires-Dist: pydantic
|
|
21
22
|
Requires-Dist: requests
|
|
22
|
-
Requires-Dist:
|
|
23
|
-
Requires-Dist:
|
|
24
|
-
Requires-Dist:
|
|
23
|
+
Requires-Dist: torch
|
|
24
|
+
Requires-Dist: tqdm
|
|
25
|
+
Requires-Dist: transformers
|
|
25
26
|
Dynamic: license-file
|
|
26
27
|
|
|
27
28
|
# Semantic Line Breaker (SemBr)
|
|
@@ -15,14 +15,15 @@ license-files = ["LICEN[CS]E*"]
|
|
|
15
15
|
requires-python = ">=3.10"
|
|
16
16
|
dependencies = [
|
|
17
17
|
"accelerate",
|
|
18
|
-
"
|
|
19
|
-
"
|
|
18
|
+
"fastmcp",
|
|
19
|
+
"flask",
|
|
20
|
+
"mcp[cli]",
|
|
20
21
|
"numpy",
|
|
21
|
-
"
|
|
22
|
+
"pydantic",
|
|
22
23
|
"requests",
|
|
23
|
-
"
|
|
24
|
-
"
|
|
25
|
-
"
|
|
24
|
+
"torch",
|
|
25
|
+
"tqdm",
|
|
26
|
+
"transformers",
|
|
26
27
|
]
|
|
27
28
|
classifiers = [
|
|
28
29
|
"Programming Language :: Python :: 3",
|
|
@@ -108,7 +108,7 @@ def inference(
|
|
|
108
108
|
if text.strip() == '':
|
|
109
109
|
return []
|
|
110
110
|
collator = DataCollatorForTokenClassification(tokenizer, padding='longest')
|
|
111
|
-
results = processor(text, split=isinstance(text, str))
|
|
111
|
+
results = processor.parse_text(text, split=isinstance(text, str))
|
|
112
112
|
results = processor.tokenize_with_modes(tokenizer, results)
|
|
113
113
|
logits, counts = _tiled_inference(
|
|
114
114
|
model, collator, results, batch_size, overlap_divisor)
|
|
@@ -143,6 +143,20 @@ class SemBrProcessor(object):
|
|
|
143
143
|
'base_indent': base_indent,
|
|
144
144
|
}
|
|
145
145
|
|
|
146
|
+
def parse_text(self, text, split=True):
|
|
147
|
+
text = text.replace('\t', ' ' * self.spaces)
|
|
148
|
+
if split:
|
|
149
|
+
text = re.split(r'\n(?:\s*\n)+', text)
|
|
150
|
+
elif isinstance(text, str):
|
|
151
|
+
raise ValueError(
|
|
152
|
+
'Text must be a list of strings if split=True.')
|
|
153
|
+
paras = []
|
|
154
|
+
for p in text:
|
|
155
|
+
if not p.strip():
|
|
156
|
+
continue
|
|
157
|
+
paras.append(self._process_paragraph(p))
|
|
158
|
+
return paras
|
|
159
|
+
|
|
146
160
|
def _tokenize_with_modes(
|
|
147
161
|
self, tokenizer, text, line_modes, line_mode_offsets, line_indents
|
|
148
162
|
):
|
|
@@ -201,19 +215,6 @@ class SemBrProcessor(object):
|
|
|
201
215
|
new_results.append(tokenized)
|
|
202
216
|
return new_results
|
|
203
217
|
|
|
204
|
-
def __call__(self, text, split=True):
|
|
205
|
-
if split:
|
|
206
|
-
text = re.split(r'\n(?:\s*\n)+', text)
|
|
207
|
-
elif isinstance(text, str):
|
|
208
|
-
raise ValueError(
|
|
209
|
-
'Text must be a list of strings if split=True.')
|
|
210
|
-
paras = []
|
|
211
|
-
for p in text:
|
|
212
|
-
if not p.strip():
|
|
213
|
-
continue
|
|
214
|
-
paras.append(self._process_paragraph(p))
|
|
215
|
-
return paras
|
|
216
|
-
|
|
217
218
|
def _replace_newlines(self, words, modes, indents):
|
|
218
219
|
new_words, new_modes, new_indents = [], [], []
|
|
219
220
|
next_mode = None
|
|
@@ -288,11 +289,10 @@ class SemBrProcessor(object):
|
|
|
288
289
|
|
|
289
290
|
|
|
290
291
|
if __name__ == '__main__':
|
|
291
|
-
|
|
292
|
-
test = open('./data/example.tex', 'r').read()
|
|
292
|
+
test = open('./data/raw/example.tex', 'r').read()
|
|
293
293
|
processor = SemBrProcessor()
|
|
294
294
|
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
|
|
295
|
-
results = processor(test)
|
|
295
|
+
results = processor.parse_text(test)
|
|
296
296
|
results = processor.tokenize_with_modes(tokenizer, results)
|
|
297
297
|
print('--- Processed ---')
|
|
298
298
|
print(processor.generate(results))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sembr
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: A semantic linebreaker powered by transformers
|
|
5
5
|
Author: admk
|
|
6
6
|
License-Expression: MIT
|
|
@@ -14,14 +14,15 @@ Requires-Python: >=3.10
|
|
|
14
14
|
Description-Content-Type: text/markdown
|
|
15
15
|
License-File: LICENSE
|
|
16
16
|
Requires-Dist: accelerate
|
|
17
|
-
Requires-Dist:
|
|
18
|
-
Requires-Dist:
|
|
17
|
+
Requires-Dist: fastmcp
|
|
18
|
+
Requires-Dist: flask
|
|
19
|
+
Requires-Dist: mcp[cli]
|
|
19
20
|
Requires-Dist: numpy
|
|
20
|
-
Requires-Dist:
|
|
21
|
+
Requires-Dist: pydantic
|
|
21
22
|
Requires-Dist: requests
|
|
22
|
-
Requires-Dist:
|
|
23
|
-
Requires-Dist:
|
|
24
|
-
Requires-Dist:
|
|
23
|
+
Requires-Dist: torch
|
|
24
|
+
Requires-Dist: tqdm
|
|
25
|
+
Requires-Dist: transformers
|
|
25
26
|
Dynamic: license-file
|
|
26
27
|
|
|
27
28
|
# Semantic Line Breaker (SemBr)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|