fragmentshot 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fragmentshot-0.1.0/LICENSE +25 -0
- fragmentshot-0.1.0/PKG-INFO +72 -0
- fragmentshot-0.1.0/README.md +61 -0
- fragmentshot-0.1.0/fragmentshot/__init__.py +9 -0
- fragmentshot-0.1.0/fragmentshot/retriever.py +110 -0
- fragmentshot-0.1.0/fragmentshot.egg-info/PKG-INFO +72 -0
- fragmentshot-0.1.0/fragmentshot.egg-info/SOURCES.txt +10 -0
- fragmentshot-0.1.0/fragmentshot.egg-info/dependency_links.txt +1 -0
- fragmentshot-0.1.0/fragmentshot.egg-info/top_level.txt +1 -0
- fragmentshot-0.1.0/pyproject.toml +15 -0
- fragmentshot-0.1.0/setup.cfg +4 -0
- fragmentshot-0.1.0/tests/test_retriever.py +43 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
Creative Commons Attribution-ShareAlike 4.0 International Public License (CC BY-SA 4.0)
|
|
2
|
+
|
|
3
|
+
By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-ShareAlike 4.0 International Public License ("Public License").
|
|
4
|
+
|
|
5
|
+
You are free to:
|
|
6
|
+
- Share — copy and redistribute the material in any medium or format
|
|
7
|
+
- Adapt — remix, transform, and build upon the material
|
|
8
|
+
for any purpose, even commercially.
|
|
9
|
+
|
|
10
|
+
Under the following terms:
|
|
11
|
+
- Attribution — You must give appropriate credit, provide a link to the license, and indicate if changes were made.
|
|
12
|
+
You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.
|
|
13
|
+
|
|
14
|
+
- ShareAlike — If you remix, transform, or build upon the material, you must distribute your contributions under
|
|
15
|
+
the same license as the original.
|
|
16
|
+
|
|
17
|
+
No additional restrictions — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits.
|
|
18
|
+
|
|
19
|
+
Notices:
|
|
20
|
+
- You do not have to comply with the license for elements of the material in the public domain or where your use is permitted by an applicable exception or limitation.
|
|
21
|
+
- No warranties are given. The license may not give you all of the permissions necessary for your intended use.
|
|
22
|
+
For example, other rights such as publicity, privacy, or moral rights may limit how you use the material.
|
|
23
|
+
|
|
24
|
+
For the full license text, please visit:
|
|
25
|
+
https://creativecommons.org/licenses/by-sa/4.0/legalcode
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fragmentshot
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Prompt generation for Fragment-Shot and Pivoted Fragment-Shot translation using LLMs
|
|
5
|
+
Author-email: Samuel Frontull <samuel.frontull@uibk.ac.at>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/schtailmuel/fragmentshot
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Dynamic: license-file
|
|
11
|
+
|
|
12
|
+
# FragmentShot Retriever
|
|
13
|
+
|
|
14
|
+
A Python package for retrieving exemplary translations for text based on fragments from parallel corpora.
|
|
15
|
+
|
|
16
|
+
## Features
|
|
17
|
+
|
|
18
|
+
- Fragment extraction from source and target texts
|
|
19
|
+
- Configurable maximum fragment size
|
|
20
|
+
- Option to enable or disable fragment overlaps
|
|
21
|
+
- Easy integration for retrieval workflows
|
|
22
|
+
|
|
23
|
+
## Installation
|
|
24
|
+
|
|
25
|
+
You can install this package locally:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install .
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Or clone the repo and install in editable mode:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
git clone https://github.com/schtailmuel/fragmentshot.git
|
|
35
|
+
cd fragmentshot
|
|
36
|
+
pip install -e .
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Usage
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from fragmentshot.retriever import FragmentShotsRetriever
|
|
43
|
+
|
|
44
|
+
src_texts = [
|
|
45
|
+
"This is a sample source sentence.",
|
|
46
|
+
"Another example source sentence."
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
tgt_texts = [
|
|
50
|
+
"Dies ist ein Beispiel im Zieltext.",
|
|
51
|
+
"Noch ein Beispiel."
|
|
52
|
+
]
|
|
53
|
+
|
|
54
|
+
result = FragmentShotsRetriever(src_texts, tgt_texts, max_fragment_size=5, overlaps=False).get_fragment_shots("This is another sample sentence.")
|
|
55
|
+
|
|
56
|
+
# Access extracted shots
|
|
57
|
+
print(result)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Testing
|
|
61
|
+
|
|
62
|
+
Run unit tests with:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
python -m unittest discover tests
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## License
|
|
69
|
+
|
|
70
|
+
This project is licensed under the Creative Commons Attribution-ShareAlike 4.0 International License.
|
|
71
|
+
|
|
72
|
+
Created by Samuel Frontull
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# FragmentShot Retriever
|
|
2
|
+
|
|
3
|
+
A Python package for retrieving exemplary translations for text based on fragments from parallel corpora.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- Fragment extraction from source and target texts
|
|
8
|
+
- Configurable maximum fragment size
|
|
9
|
+
- Option to enable or disable fragment overlaps
|
|
10
|
+
- Easy integration for retrieval workflows
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
You can install this package locally:
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install .
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Or clone the repo and install in editable mode:
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
git clone https://github.com/schtailmuel/fragmentshot.git
|
|
24
|
+
cd fragmentshot
|
|
25
|
+
pip install -e .
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Usage
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
from fragmentshot.retriever import FragmentShotsRetriever
|
|
32
|
+
|
|
33
|
+
src_texts = [
|
|
34
|
+
"This is a sample source sentence.",
|
|
35
|
+
"Another example source sentence."
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
tgt_texts = [
|
|
39
|
+
"Dies ist ein Beispiel im Zieltext.",
|
|
40
|
+
"Noch ein Beispiel."
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
result = FragmentShotsRetriever(src_texts, tgt_texts, max_fragment_size=5, overlaps=False).get_fragment_shots("This is another sample sentence.")
|
|
44
|
+
|
|
45
|
+
# Access extracted shots
|
|
46
|
+
print(result)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Testing
|
|
50
|
+
|
|
51
|
+
Run unit tests with:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
python -m unittest discover tests
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## License
|
|
58
|
+
|
|
59
|
+
This project is licensed under the Creative Commons Attribution-ShareAlike 4.0 International License.
|
|
60
|
+
|
|
61
|
+
Created by Samuel Frontull
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import random
|
|
3
|
+
|
|
4
|
+
class FragmentShotsRetriever:
    """Retrieve example translation pairs that share word fragments with a text.

    The source side of a parallel corpus is split into word n-grams
    ("fragments") of every size from 1 to ``max_fragment_size``. A query text
    is then covered greedily, longest fragments first, and for every query
    fragment found in the corpus the sentence pairs containing it are
    returned as examples.

    Attributes:
        src_texts: Source sentences, stored as passed in.
        tgt_texts: Target sentences, aligned with ``src_texts`` by index.
        overlaps: Whether fragments of the same size may overlap in a query.
        max_fragment_size: Largest fragment length (in words) that is indexed.
        corpus_fragments_str: Maps fragment size to the list of corpus
            fragments of that size (each fragment is a list of words).
        corpus_fragments_idx: Maps fragment size to a list, parallel to
            ``corpus_fragments_str[size]``, holding the index of the sentence
            each fragment was taken from.
    """

    def __init__(self, src_texts, tgt_texts, max_fragment_size=7, overlaps=False):
        """Store the parallel corpus and index its source-side fragments.

        Args:
            src_texts: Sequence of source sentences.
            tgt_texts: Sequence of target sentences, same length as
                ``src_texts``.
            max_fragment_size: Largest fragment length (in words) to index.
            overlaps: If true, same-size fragments of a query may overlap.

        Raises:
            ValueError: If ``src_texts`` and ``tgt_texts`` differ in length.
        """
        if len(src_texts) != len(tgt_texts):
            raise ValueError(
                "Source and target files must have the same number of lines."
            )

        self.src_texts = src_texts
        self.tgt_texts = tgt_texts
        self.overlaps = overlaps
        self.max_fragment_size = max_fragment_size

        self.corpus_fragments_str = {}
        self.corpus_fragments_idx = {}
        self._init_corpus_fragments()

    def _init_corpus_fragments(self):
        """Fill the per-size fragment lists from the source sentences."""
        src_tok = [self._remove_punctuation(s).split() for s in self.src_texts]

        for size in range(1, self.max_fragment_size + 1):
            fragments = []
            sentence_ids = []
            for sent_id, words in enumerate(src_tok):
                for fragment in self._create_fragments(words, size):
                    fragments.append(fragment)
                    sentence_ids.append(sent_id)
            self.corpus_fragments_str[size] = fragments
            self.corpus_fragments_idx[size] = sentence_ids

    def _remove_punctuation(self, text):
        """Collapse runs of spaces, then delete the characters ``.,!?:;``."""
        text = re.sub(r"[ ]+", " ", text)
        text = re.sub(r"[.,!?:;]", "", text)
        return text

    def _create_fragments(self, xs, n):
        """Return every contiguous slice of length ``n`` of ``xs``, in order.

        The result is empty when ``xs`` has fewer than ``n`` items.
        """
        return [xs[i : i + n] for i in range(len(xs) - n + 1)]

    def _matching_sentence_ids(self, fragment, size):
        """Return the distinct corpus sentence indices containing ``fragment``.

        A corpus fragment matches when it equals ``fragment`` exactly or
        equals the lower-cased form of ``fragment``. The corpus side is not
        lower-cased. Indices are returned in first-occurrence order, each at
        most once, even if the fragment occurs repeatedly in one sentence.
        """
        lowered = [word.lower() for word in fragment]
        candidates = zip(
            self.corpus_fragments_str.get(size, []),
            self.corpus_fragments_idx.get(size, []),
        )
        # A dict is used as an insertion-ordered set to drop duplicates.
        distinct = {}
        for corpus_fragment, sent_id in candidates:
            if corpus_fragment == fragment or corpus_fragment == lowered:
                distinct[sent_id] = None
        return list(distinct)

    def get_fragment_shots(self, text, num_shots=6):
        """Retrieve example sentence pairs for the fragments of ``text``.

        The text is stripped of punctuation and split on whitespace.
        Fragment sizes are tried from the largest possible down to 1. A
        fragment is skipped if it contains a word already covered by a larger
        matched fragment. Unless ``overlaps`` is set, it is also skipped if
        it starts on a word covered by an earlier match of the same size.

        Args:
            text: The text to find examples for.
            num_shots: Maximum number of example pairs per fragment. Values
                of zero or less yield no examples.

        Returns:
            A dict with the keys:
                ``"shots"``: list of ``{"fragment": str, "examples": list}``
                    entries, where each example is a dict with
                    ``"src_text"`` and ``"tgt_text"``. Examples are drawn in
                    random order from the matching sentences.
                ``"num_words"``: number of words in the tokenized text.
                ``"unknown"``: words not covered by any matched fragment.
        """
        shots = []

        tokens = self._remove_punctuation(text).split()
        # Coverage is tracked separately from the tokens so that a literal
        # "#" (or any other word) in the input is never mistaken for a marker.
        covered = [False] * len(tokens)
        limit = max(num_shots, 0)
        start_size = min(len(tokens), self.max_fragment_size)

        for size in range(start_size, 0, -1):
            # Words matched during this pass; merged into `covered` only after
            # the pass so that same-size overlaps stay possible when enabled.
            covered_now = set()

            for start, fragment in enumerate(self._create_fragments(tokens, size)):
                span = range(start, start + size)

                if any(covered[j] for j in span):
                    continue
                if not self.overlaps and start in covered_now:
                    continue

                sent_ids = self._matching_sentence_ids(fragment, size)
                if not sent_ids:
                    continue

                random.shuffle(sent_ids)
                examples = [
                    {
                        "src_text": self.src_texts[sent_id],
                        "tgt_text": self.tgt_texts[sent_id],
                    }
                    for sent_id in sent_ids[:limit]
                ]
                shots.append(
                    {
                        "fragment": " ".join(fragment),
                        "examples": examples,
                    }
                )
                covered_now.update(span)

            for j in covered_now:
                covered[j] = True

        return {
            "shots": shots,
            "num_words": len(tokens),
            "unknown": [
                word for word, is_covered in zip(tokens, covered) if not is_covered
            ],
        }
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fragmentshot
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Prompt generation for Fragment-Shot and Pivoted Fragment-Shot translation using LLMs
|
|
5
|
+
Author-email: Samuel Frontull <samuel.frontull@uibk.ac.at>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/schtailmuel/fragmentshot
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Dynamic: license-file
|
|
11
|
+
|
|
12
|
+
# FragmentShot Retriever
|
|
13
|
+
|
|
14
|
+
A Python package for retrieving exemplary translations for text based on fragments from parallel corpora.
|
|
15
|
+
|
|
16
|
+
## Features
|
|
17
|
+
|
|
18
|
+
- Fragment extraction from source and target texts
|
|
19
|
+
- Configurable maximum fragment size
|
|
20
|
+
- Option to enable or disable fragment overlaps
|
|
21
|
+
- Easy integration for retrieval workflows
|
|
22
|
+
|
|
23
|
+
## Installation
|
|
24
|
+
|
|
25
|
+
You can install this package locally:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install .
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Or clone the repo and install in editable mode:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
git clone https://github.com/schtailmuel/fragmentshot.git
|
|
35
|
+
cd fragmentshot
|
|
36
|
+
pip install -e .
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Usage
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from fragmentshot.retriever import FragmentShotsRetriever
|
|
43
|
+
|
|
44
|
+
src_texts = [
|
|
45
|
+
"This is a sample source sentence.",
|
|
46
|
+
"Another example source sentence."
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
tgt_texts = [
|
|
50
|
+
"Dies ist ein Beispiel im Zieltext.",
|
|
51
|
+
"Noch ein Beispiel."
|
|
52
|
+
]
|
|
53
|
+
|
|
54
|
+
result = FragmentShotsRetriever(src_texts, tgt_texts, max_fragment_size=5, overlaps=False).get_fragment_shots("This is another sample sentence.")
|
|
55
|
+
|
|
56
|
+
# Access extracted shots
|
|
57
|
+
print(result)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Testing
|
|
61
|
+
|
|
62
|
+
Run unit tests with:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
python -m unittest discover tests
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## License
|
|
69
|
+
|
|
70
|
+
This project is licensed under the Creative Commons Attribution-ShareAlike 4.0 International License.
|
|
71
|
+
|
|
72
|
+
Created by Samuel Frontull
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
fragmentshot/__init__.py
|
|
5
|
+
fragmentshot/retriever.py
|
|
6
|
+
fragmentshot.egg-info/PKG-INFO
|
|
7
|
+
fragmentshot.egg-info/SOURCES.txt
|
|
8
|
+
fragmentshot.egg-info/dependency_links.txt
|
|
9
|
+
fragmentshot.egg-info/top_level.txt
|
|
10
|
+
tests/test_retriever.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
fragmentshot
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "fragmentshot"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Prompt generation for Fragment-Shot and Pivoted Fragment-Shot translation using LLMs"
|
|
9
|
+
authors = [{ name="Samuel Frontull", email="samuel.frontull@uibk.ac.at" }]
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
readme = "README.md"
|
|
12
|
+
dependencies = []
|
|
13
|
+
|
|
14
|
+
[project.urls]
|
|
15
|
+
Homepage = "https://github.com/schtailmuel/fragmentshot"
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
from fragmentshot.retriever import FragmentShotsRetriever
|
|
3
|
+
|
|
4
|
+
class TestFragmentShotsRetriever(unittest.TestCase):
    """Unit tests for FragmentShotsRetriever."""

    def setUp(self):
        """Create a fresh two-sentence parallel corpus for each test."""
        # setUp is unittest's per-test hook and runs on the test instance, so
        # it is a plain instance method rather than a classmethod.
        self.src_texts = [
            "The cat sleeps on the couch",
            "A dog runs in the garden",
        ]
        self.tgt_texts = [
            "Il gatto dorme sul divano",
            "Un cane corre nel giardino",
        ]

    def test_constructor_with_valid_input(self):
        """The constructor stores the corpus and applies its defaults."""
        retriever = FragmentShotsRetriever(self.src_texts, self.tgt_texts)
        self.assertEqual(len(retriever.src_texts), 2)
        self.assertEqual(len(retriever.tgt_texts), 2)
        self.assertFalse(retriever.overlaps)
        self.assertEqual(retriever.max_fragment_size, 7)

    def test_constructor_raises_error_on_mismatched_input(self):
        """Corpora of different lengths are rejected with ValueError."""
        with self.assertRaises(ValueError):
            FragmentShotsRetriever(self.src_texts, self.tgt_texts + ["extra"])

    def test_fragment_initialization(self):
        """Fragments and their sentence indices are built in parallel."""
        retriever = FragmentShotsRetriever(
            self.src_texts, self.tgt_texts, max_fragment_size=3
        )

        self.assertIn(3, retriever.corpus_fragments_str)
        self.assertIn(3, retriever.corpus_fragments_idx)
        self.assertGreater(len(retriever.corpus_fragments_str[3]), 0)
        self.assertEqual(
            len(retriever.corpus_fragments_str[3]),
            len(retriever.corpus_fragments_idx[3]),
        )

    def test_retrieve(self):
        """Only the fragment shared with the corpus yields a shot."""
        input_sentence = "my dogs sleeps on the floow"
        retriever = FragmentShotsRetriever(self.src_texts, self.tgt_texts)
        results = retriever.get_fragment_shots(input_sentence)
        self.assertEqual(len(results['shots']), 1)
        self.assertEqual(results['shots'][0]['fragment'], "sleeps on the")
|
|
41
|
+
|
|
42
|
+
if __name__ == "__main__":
|
|
43
|
+
unittest.main()
|