fragmentshot 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,25 @@
1
+ Creative Commons Attribution-ShareAlike 4.0 International Public License (CC BY-SA 4.0)
2
+
3
+ By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-ShareAlike 4.0 International Public License ("Public License").
4
+
5
+ You are free to:
6
+ - Share — copy and redistribute the material in any medium or format
7
+ - Adapt — remix, transform, and build upon the material
8
+ for any purpose, even commercially.
9
+
10
+ Under the following terms:
11
+ - Attribution — You must give appropriate credit, provide a link to the license, and indicate if changes were made.
12
+ You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.
13
+
14
+ - ShareAlike — If you remix, transform, or build upon the material, you must distribute your contributions under
15
+ the same license as the original.
16
+
17
+ No additional restrictions — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits.
18
+
19
+ Notices:
20
+ - You do not have to comply with the license for elements of the material in the public domain or where your use is permitted by an applicable exception or limitation.
21
+ - No warranties are given. The license may not give you all of the permissions necessary for your intended use.
22
+ For example, other rights such as publicity, privacy, or moral rights may limit how you use the material.
23
+
24
+ For the full license text, please visit:
25
+ https://creativecommons.org/licenses/by-sa/4.0/legalcode
@@ -0,0 +1,72 @@
1
+ Metadata-Version: 2.4
2
+ Name: fragmentshot
3
+ Version: 0.1.0
4
+ Summary: Prompt generation for Fragment-Shot and Pivoted Fragment-Shot translation using LLMs
5
+ Author-email: Samuel Frontull <samuel.frontull@uibk.ac.at>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/schtailmuel/fragmentshot
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Dynamic: license-file
11
+
12
+ # FragmentShot Retriever
13
+
14
+ A Python package for retrieving exemplary translations for text based on fragments from parallel corpora.
15
+
16
+ ## Features
17
+
18
+ - Fragment extraction from source and target texts
19
+ - Configurable maximum fragment size
20
+ - Option to enable or disable fragment overlaps
21
+ - Easy integration for retrieval workflows
22
+
23
+ ## Installation
24
+
25
+ You can install this package locally:
26
+
27
+ ```bash
28
+ pip install .
29
+ ```
30
+
31
+ Or clone the repo and install in editable mode:
32
+
33
+ ```bash
34
+ git clone https://github.com/schtailmuel/fragmentshot.git
35
+ cd fragmentshot
36
+ pip install -e .
37
+ ```
38
+
39
+ ## Usage
40
+
41
+ ```python
42
+ from fragmentshot.retriever import FragmentShotsRetriever
43
+
44
+ src_texts = [
45
+ "This is a sample source sentence.",
46
+ "Another example source sentence."
47
+ ]
48
+
49
+ tgt_texts = [
50
+ "Dies ist ein Beispiel im Zieltext.",
51
+ "Noch ein Beispiel."
52
+ ]
53
+
54
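+ retriever = FragmentShotsRetriever(src_texts, tgt_texts, max_fragment_size=5, overlaps=False)
55
+
56
+ # Retrieve example translations for the fragments of a new sentence
57
+ print(retriever.get_fragment_shots("This is another sample sentence."))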
58
+ ```
59
+
60
+ ## Testing
61
+
62
+ Run unit tests with:
63
+
64
+ ```bash
65
+ python -m unittest discover tests
66
+ ```
67
+
68
+ ## License
69
+
70
+ This project is licensed under the Creative Commons Attribution-ShareAlike 4.0 International License.
71
+
72
+ Created by Samuel Frontull
@@ -0,0 +1,61 @@
1
+ # FragmentShot Retriever
2
+
3
+ A Python package for retrieving exemplary translations for text based on fragments from parallel corpora.
4
+
5
+ ## Features
6
+
7
+ - Fragment extraction from source and target texts
8
+ - Configurable maximum fragment size
9
+ - Option to enable or disable fragment overlaps
10
+ - Easy integration for retrieval workflows
11
+
12
+ ## Installation
13
+
14
+ You can install this package locally:
15
+
16
+ ```bash
17
+ pip install .
18
+ ```
19
+
20
+ Or clone the repo and install in editable mode:
21
+
22
+ ```bash
23
+ git clone https://github.com/schtailmuel/fragmentshot.git
24
+ cd fragmentshot
25
+ pip install -e .
26
+ ```
27
+
28
+ ## Usage
29
+
30
+ ```python
31
+ from fragmentshot.retriever import FragmentShotsRetriever
32
+
33
+ src_texts = [
34
+ "This is a sample source sentence.",
35
+ "Another example source sentence."
36
+ ]
37
+
38
+ tgt_texts = [
39
+ "Dies ist ein Beispiel im Zieltext.",
40
+ "Noch ein Beispiel."
41
+ ]
42
+
43
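+ retriever = FragmentShotsRetriever(src_texts, tgt_texts, max_fragment_size=5, overlaps=False)
44
+
45
+ # Retrieve example translations for the fragments of a new sentence
46
+ print(retriever.get_fragment_shots("This is another sample sentence."))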
47
+ ```
48
+
49
+ ## Testing
50
+
51
+ Run unit tests with:
52
+
53
+ ```bash
54
+ python -m unittest discover tests
55
+ ```
56
+
57
+ ## License
58
+
59
+ This project is licensed under the Creative Commons Attribution-ShareAlike 4.0 International License.
60
+
61
+ Created by Samuel Frontull
@@ -0,0 +1,9 @@
1
# fragmentshot/__init__.py
"""Public entry point of the fragmentshot package."""

__version__ = "0.1.0"

from .retriever import FragmentShotsRetriever

# Only names that actually exist at module level may be listed here;
# otherwise ``from fragmentshot import *`` raises AttributeError.
__all__ = [
    "FragmentShotsRetriever",
]
@@ -0,0 +1,110 @@
1
+ import re
2
+ import random
3
+
4
class FragmentShotsRetriever:
    """Retrieve example translations for the word fragments of a text.

    The source side of a parallel corpus is split into word n-grams
    ("fragments") of every size from 1 up to ``max_fragment_size``. For a
    query text, the longest fragments that also occur in the corpus are
    looked up first, and the sentence pairs containing them are returned
    as examples ("shots").

    Attributes:
        src_texts: Source-language sentences, as passed in.
        tgt_texts: Target-language sentences, aligned with ``src_texts``.
        overlaps: Whether matched fragments of the same size may overlap.
        max_fragment_size: Largest fragment size (in words) that is indexed.
        corpus_fragments_str: Maps fragment size -> list of fragments
            (each a list of words) found in the source corpus.
        corpus_fragments_idx: Maps fragment size -> list of sentence
            indices, parallel to ``corpus_fragments_str[size]``.
    """

    def __init__(self, src_texts, tgt_texts, max_fragment_size=7, overlaps=False):
        """Store the corpus and pre-compute its fragments.

        Args:
            src_texts: Sequence of source sentences.
            tgt_texts: Sequence of target sentences, same length as
                ``src_texts``.
            max_fragment_size: Largest fragment size to index.
            overlaps: Allow same-size matched fragments to overlap.

        Raises:
            ValueError: If the two sequences differ in length.
        """
        if len(src_texts) != len(tgt_texts):
            raise ValueError(
                "Source and target files must have the same number of lines."
            )

        self.src_texts = src_texts
        self.tgt_texts = tgt_texts
        self.overlaps = overlaps
        self.max_fragment_size = max_fragment_size

        self.corpus_fragments_str = {}
        self.corpus_fragments_idx = {}
        self._init_corpus_fragments()

    def _init_corpus_fragments(self):
        """Fill the per-size fragment and sentence-index lists."""
        src_tok = [self._remove_punctuation(s).split() for s in self.src_texts]

        for size in range(1, self.max_fragment_size + 1):
            fragments = []
            sentence_ids = []
            for sent_id, tokens in enumerate(src_tok):
                for fragment in self._create_fragments(tokens, size):
                    fragments.append(fragment)
                    sentence_ids.append(sent_id)
            self.corpus_fragments_str[size] = fragments
            self.corpus_fragments_idx[size] = sentence_ids

    def _remove_punctuation(self, text):
        """Collapse runs of spaces and strip the characters ``.,!?:;``."""
        text = re.sub(r"[ ]+", " ", text)
        text = re.sub(r"[.,!?:;]", "", text)
        return text

    def _create_fragments(self, xs, n):
        """Return every contiguous slice of length ``n`` of the list ``xs``."""
        return [xs[i : i + n] for i in range(len(xs) - n + 1)]

    def get_fragment_shots(self, text, num_shots=6):
        """Retrieve example sentence pairs for the fragments of ``text``.

        Fragment sizes are tried from largest to smallest. Words covered by
        a match at a larger size are never matched again at a smaller size.
        Within one size, a fragment starting on an already covered word is
        skipped unless ``self.overlaps`` is true.

        Args:
            text: The text to find examples for.
            num_shots: Maximum number of examples per matched fragment. A
                matched fragment always yields at least one example.

        Returns:
            A dict with the keys ``"shots"`` (list of dicts holding the
            matched ``"fragment"`` string and its ``"examples"``, each a
            dict with ``"src_text"`` and ``"tgt_text"``), ``"num_words"``
            (number of words in ``text``) and ``"unknown"`` (words not
            covered by any matched fragment).
        """
        tokens = self._remove_punctuation(text).split()
        # Coverage is tracked in a separate mask rather than by overwriting
        # tokens with a sentinel, so a literal "#" in the input is handled
        # like any other word.
        covered = [False] * len(tokens)
        shots = []
        limit = max(num_shots, 1)

        start_size = min(len(tokens), self.max_fragment_size)

        for size in range(start_size, 0, -1):
            corpus_fragments = self.corpus_fragments_str[size]
            corpus_sent_ids = self.corpus_fragments_idx[size]
            # Snapshot: words covered by matches of larger sizes.
            covered_before = list(covered)

            for start, fragment in enumerate(self._create_fragments(tokens, size)):
                end = start + size

                # Never reuse words already consumed by a longer fragment.
                if any(covered_before[start:end]):
                    continue
                # Same-size overlap: the start word was covered in this round.
                if covered[start] and not self.overlaps:
                    continue

                fragment_lower = [tok.lower() for tok in fragment]
                sent_ids = [
                    sent_id
                    for candidate, sent_id in zip(corpus_fragments, corpus_sent_ids)
                    if candidate == fragment or candidate == fragment_lower
                ]
                if not sent_ids:
                    continue

                # A fragment may occur several times in one sentence; keep
                # each sentence once so examples are not duplicated.
                sent_ids = list(dict.fromkeys(sent_ids))
                random.shuffle(sent_ids)

                examples = [
                    {
                        "src_text": self.src_texts[sent_id],
                        "tgt_text": self.tgt_texts[sent_id],
                    }
                    for sent_id in sent_ids[:limit]
                ]
                shots.append(
                    {
                        "fragment": " ".join(fragment),
                        "examples": examples,
                    }
                )

                for pos in range(start, end):
                    covered[pos] = True

        return {
            "shots": shots,
            "num_words": len(tokens),
            "unknown": [tok for tok, done in zip(tokens, covered) if not done],
        }
@@ -0,0 +1,72 @@
1
+ Metadata-Version: 2.4
2
+ Name: fragmentshot
3
+ Version: 0.1.0
4
+ Summary: Prompt generation for Fragment-Shot and Pivoted Fragment-Shot translation using LLMs
5
+ Author-email: Samuel Frontull <samuel.frontull@uibk.ac.at>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/schtailmuel/fragmentshot
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Dynamic: license-file
11
+
12
+ # FragmentShot Retriever
13
+
14
+ A Python package for retrieving exemplary translations for text based on fragments from parallel corpora.
15
+
16
+ ## Features
17
+
18
+ - Fragment extraction from source and target texts
19
+ - Configurable maximum fragment size
20
+ - Option to enable or disable fragment overlaps
21
+ - Easy integration for retrieval workflows
22
+
23
+ ## Installation
24
+
25
+ You can install this package locally:
26
+
27
+ ```bash
28
+ pip install .
29
+ ```
30
+
31
+ Or clone the repo and install in editable mode:
32
+
33
+ ```bash
34
+ git clone https://github.com/schtailmuel/fragmentshot.git
35
+ cd fragmentshot
36
+ pip install -e .
37
+ ```
38
+
39
+ ## Usage
40
+
41
+ ```python
42
+ from fragmentshot.retriever import FragmentShotsRetriever
43
+
44
+ src_texts = [
45
+ "This is a sample source sentence.",
46
+ "Another example source sentence."
47
+ ]
48
+
49
+ tgt_texts = [
50
+ "Dies ist ein Beispiel im Zieltext.",
51
+ "Noch ein Beispiel."
52
+ ]
53
+
54
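+ retriever = FragmentShotsRetriever(src_texts, tgt_texts, max_fragment_size=5, overlaps=False)
55
+
56
+ # Retrieve example translations for the fragments of a new sentence
57
+ print(retriever.get_fragment_shots("This is another sample sentence."))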
58
+ ```
59
+
60
+ ## Testing
61
+
62
+ Run unit tests with:
63
+
64
+ ```bash
65
+ python -m unittest discover tests
66
+ ```
67
+
68
+ ## License
69
+
70
+ This project is licensed under the Creative Commons Attribution-ShareAlike 4.0 International License.
71
+
72
+ Created by Samuel Frontull
@@ -0,0 +1,10 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ fragmentshot/__init__.py
5
+ fragmentshot/retriever.py
6
+ fragmentshot.egg-info/PKG-INFO
7
+ fragmentshot.egg-info/SOURCES.txt
8
+ fragmentshot.egg-info/dependency_links.txt
9
+ fragmentshot.egg-info/top_level.txt
10
+ tests/test_retriever.py
@@ -0,0 +1 @@
1
+ fragmentshot
@@ -0,0 +1,15 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "fragmentshot"
7
+ version = "0.1.0"
8
+ description = "Prompt generation for Fragment-Shot and Pivoted Fragment-Shot translation using LLMs"
9
+ authors = [{ name="Samuel Frontull", email="samuel.frontull@uibk.ac.at" }]
10
+ license = { text = "CC-BY-SA-4.0" }
11
+ readme = "README.md"
12
+ dependencies = []
13
+
14
+ [project.urls]
15
+ Homepage = "https://github.com/schtailmuel/fragmentshot"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,43 @@
1
+ import unittest
2
+ from fragmentshot.retriever import FragmentShotsRetriever
3
+
4
class TestFragmentShotsRetriever(unittest.TestCase):
    """Unit tests for FragmentShotsRetriever construction and retrieval."""

    def setUp(self):
        # Per-test fixture: setUp is an instance-level hook, so the corpus
        # is stored on the test instance instead of on the class.
        self.src_texts = [
            "The cat sleeps on the couch",
            "A dog runs in the garden",
        ]
        self.tgt_texts = [
            "Il gatto dorme sul divano",
            "Un cane corre nel giardino",
        ]

    def test_constructor_with_valid_input(self):
        """Defaults and stored corpus are exposed as attributes."""
        retriever = FragmentShotsRetriever(self.src_texts, self.tgt_texts)
        self.assertEqual(len(retriever.src_texts), 2)
        self.assertEqual(len(retriever.tgt_texts), 2)
        self.assertFalse(retriever.overlaps)
        self.assertEqual(retriever.max_fragment_size, 7)

    def test_constructor_raises_error_on_mismatched_input(self):
        """Corpora of different lengths are rejected."""
        with self.assertRaises(ValueError):
            FragmentShotsRetriever(self.src_texts, self.tgt_texts + ["extra"])

    def test_fragment_initialization(self):
        """Fragments and sentence indices are built in parallel per size."""
        retriever = FragmentShotsRetriever(
            self.src_texts, self.tgt_texts, max_fragment_size=3
        )

        self.assertIn(3, retriever.corpus_fragments_str)
        self.assertIn(3, retriever.corpus_fragments_idx)
        self.assertTrue(len(retriever.corpus_fragments_str[3]) > 0)
        self.assertEqual(
            len(retriever.corpus_fragments_str[3]),
            len(retriever.corpus_fragments_idx[3]),
        )

    def test_retrieve(self):
        """Only "sleeps on the" occurs in the corpus, giving one shot."""
        input_sentence = "my dogs sleeps on the floow"
        retriever = FragmentShotsRetriever(self.src_texts, self.tgt_texts)
        results = retriever.get_fragment_shots(input_sentence)
        self.assertEqual(len(results['shots']), 1)


if __name__ == "__main__":
    unittest.main()