disvortilo 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
disvortilo/__init__.py ADDED
@@ -0,0 +1,172 @@
1
+ import importlib.resources
2
+ import re
3
+ from collections.abc import Generator
4
+
5
+
6
def load_word_list(resource_name: str) -> set[str]:
    """Read a word-list resource bundled with this package into a set.

    Each line contributes at most one entry: everything from the first
    ``#`` onwards is discarded as a comment, surrounding whitespace is
    stripped, and lines that end up empty are skipped.

    Args:
        resource_name: File name of the resource inside this package.

    Returns:
        The distinct entries found in the resource.
    """
    text = (
        importlib.resources.files(__package__)
        .joinpath(resource_name)
        .read_text("utf-8")
    )
    return {
        entry
        for raw_line in text.splitlines()
        # Keep only the part before any comment; drop blank results.
        if (entry := raw_line.partition("#")[0].strip())
    }
17
+
18
+
19
def growing_string(string: str) -> Generator[str, None, None]:
    """Yield every non-empty prefix of *string*, shortest first.

    For ``"abc"`` this yields ``"a"``, ``"ab"`` and ``"abc"``; an empty
    string yields nothing.

    The return annotation spells out all three ``Generator`` parameters:
    the one-parameter form is only understood by type checkers from
    Python 3.13 on.
    """
    for end in range(1, len(string) + 1):
        yield string[:end]
24
+
25
+
26
# Endings that may directly follow a root or suffix to complete a word
# (see Disvortilo.parse, which never accepts them right after a prefix).
WORD_ENDS = {
    "e", "en",
    "a", "an", "ajn", "aj",
    "o", "on", "ojn", "oj",
    "as", "os", "is", "us", "u", "i"
}

# A correlative is recognised as one of these starts followed by one of the
# ends below (see Disvortilo._parse_correlative).
CORRELATIVE_WORD_STARTS = {
    "ki", "ti", "i", "ĉi", "neni"
}
CORRELATIVE_WORD_ENDS = {
    "o", "on", "oj", "ojn",
    "u", "un", "uj", "ujn",
    # The -a series takes the same plural/accusative endings as -o and -u
    # (kia, kian, kiaj, kiajn); without them those forms are never matched.
    "a", "an", "aj", "ajn",
    "e", "en",
    "am", "el", "es", "om", "al"
}
42
+
43
+
44
class Disvortilo:
    """Split words into known parts using the packaged word lists.

    Four inventories (suffixes, prefixes, roots and full words) are loaded
    from the package resources when an instance is created; ``parse`` then
    enumerates every way a word can be cut into entries of those
    inventories followed by an allowed ending.
    """

    def __init__(self):
        """Load the four inventories from the bundled text resources."""
        self.suffixes = load_word_list("suffixes.txt")
        self.prefixes = load_word_list("prefixes.txt")
        self.roots = load_word_list("roots.txt")
        self.full_words = load_word_list("full_words.txt")

    def _is_in(self, word: str, _suffix, _prefix, _root, _full_word):
        """Tell which enabled inventory contains *word*.

        The inventories are consulted in the order root, suffix, prefix,
        full word; the first enabled one that contains *word* wins.

        Returns:
            ``"root"``, ``"suffix"``, ``"prefix"`` or ``"full_words"``, or
            an empty string when no enabled inventory contains *word*.
        """
        lookups = (
            (_root, self.roots, "root"),
            (_suffix, self.suffixes, "suffix"),
            (_prefix, self.prefixes, "prefix"),
            (_full_word, self.full_words, "full_words"),
        )
        for enabled, inventory, label in lookups:
            if enabled and word in inventory:
                return label
        return ""

    def _parse_correlative(self, word: str) -> list[tuple[str, ...]]:
        """Split *word* into a correlative start and end, if it is one.

        The shortest prefix of *word* found in ``CORRELATIVE_WORD_STARTS``
        fixes the split point; the rest must be in
        ``CORRELATIVE_WORD_ENDS``.

        Returns:
            A one-element list ``[(start, end)]`` on success, else ``[]``.
        """
        start = next(
            (
                head
                for head in growing_string(word)
                if head in CORRELATIVE_WORD_STARTS
            ),
            None,
        )
        if start is None:
            # No prefix of the word is a correlative start.
            return []

        ending = word[len(start):]
        if ending not in CORRELATIVE_WORD_ENDS:
            return []
        return [(start, ending)]

    def _parse_number(self, word: str) -> list[tuple[str, ...]]:
        """Recognise a run of digits, optionally followed by ``a`` or ``an``.

        Returns:
            ``[(digits,)]`` or ``[(digits, ending)]`` when *word* has that
            shape, else ``[]``.
        """
        matches = []
        for digits in growing_string(word):
            if not digits.isdigit():
                continue
            ending = word[len(digits):]
            if not ending:
                matches.append((digits,))
            elif ending in ("a", "an"):
                matches.append((digits, ending))
        return matches

    def parse(
        self,
        word: str,

        # Controls the valid next part
        _suffix: bool = False,
        _prefix: bool = True,
        _root: bool = True,
        _full_word_integrated: bool = True,
        _correlative: bool = True,
        _full_word_standalone: bool = True,
        _number: bool = True
    ) -> list[tuple[str, ...]]:
        """Return every known way to split *word* into parts.

        A word that is itself a full word, a correlative or a number is
        returned as that single analysis. Otherwise each prefix of *word*
        found in an enabled inventory is tried as the first part and the
        rest is analysed recursively.

        Args:
            word: The word to analyse.
            _suffix, _prefix, _root, _full_word_integrated: Which
                inventories the first part may come from.
            _correlative, _full_word_standalone, _number: Whether the
                corresponding whole-word shortcuts are tried. The
                recursive calls below switch these off.

        Returns:
            A list of tuples, one per analysis; empty when none is found.
        """
        if _full_word_standalone and word in self.full_words:
            return [(word,)]

        if _correlative and (as_correlative := self._parse_correlative(word)):
            return as_correlative

        if _number and (as_number := self._parse_number(word)):
            return as_number

        analyses = []
        for head in growing_string(word):
            kind = self._is_in(
                head, _suffix, _prefix, _root, _full_word_integrated
            )
            if not kind:
                continue

            tail = word[len(head):]

            # An "o" right after this part may join it to a following
            # part; what comes after the "o" may not start with an affix.
            if len(tail) > 1 and tail[0] == "o":
                joined = self.parse(
                    tail[1:],
                    _suffix=False,
                    _prefix=False,
                    _correlative=False,
                    _full_word_standalone=False,
                    _number=False
                )
                analyses.extend((head, "o") + rest for rest in joined)

            # Allow if the prefix can be used as a root too. Disallow an
            # end after a prefix.
            if kind != "prefix" and tail in WORD_ENDS:
                analyses.append((head, tail))
                continue

            # Otherwise the remainder must itself split into parts;
            # suffixes are allowed from here on.
            continued = self.parse(
                tail,
                _suffix=True,
                _correlative=False,
                _full_word_standalone=False,
                _number=False
            )
            analyses.extend((head,) + rest for rest in continued)

        return analyses
147
+
148
+
149
# A token is either the elided article "l'" (listed in full_words.txt, so the
# apostrophe must stay attached for it to be recognised) or a run of ASCII
# letters, Esperanto letters and digits. The "l'" alternative comes first so
# it wins when a token starts with "l" followed by an apostrophe.
_ESPERANTO_SPLIT_WORDS = r"l'|[A-Za-zĉĝĥĵŝŭĈĜĤĴŜŬ0-9]+"


def _split_sentence(sentence: str):
    """Return the word tokens of *sentence* in order of appearance.

    Punctuation and whitespace are dropped; the only token that keeps an
    apostrophe is ``l'``. An empty sentence gives an empty list.
    """
    return re.findall(_ESPERANTO_SPLIT_WORDS, sentence)
154
+
155
+
156
def _parse_sentence(sentence: str):
    """Print the analysis of every word in *sentence*, one word per line.

    A word with at least one analysis is printed as its analyses separated
    by spaces, the parts of each analysis joined with ``·``. A word with no
    analysis is printed unchanged between ``~`` marks. A blank line is
    printed after the last word.
    """
    tokens = _split_sentence(sentence)

    analyser = Disvortilo()

    for token in tokens:
        options = analyser.parse(token)
        if options:
            rendered = " ".join("·".join(option) for option in options)
        else:
            rendered = f"~{token}~"
        print(rendered, end="\n")

    print()
@@ -0,0 +1,113 @@
1
+ min
2
+ vin
3
+ ŝin
4
+ lin
5
+ ĝin
6
+ nin
7
+ ilin
8
+
9
+ morgaŭ
10
+ sen
11
+
12
+ # Taken from eoparser/full_words.txt:
13
+
14
+ adiaŭ
15
+ al
16
+ ankaŭ
17
+ ankoraŭ
18
+ anstataŭ
19
+ apenaŭ
20
+ antaŭ
21
+ apud
22
+
23
+ baldaŭ
24
+ cent
25
+ ci
26
+ ĉi
27
+ ĉar
28
+ ĉe
29
+ ĉirkaŭ
30
+ ĉu
31
+ da
32
+ de
33
+ do
34
+ dum
35
+
36
+ el
37
+ en
38
+ for
39
+ ĝis
40
+ ĝi
41
+ hieraŭ
42
+ hodiaŭ
43
+ ili
44
+ inter
45
+ jam
46
+ ja
47
+ je
48
+ jen
49
+ jes
50
+ ĵus
51
+ kaj
52
+ kontraŭ
53
+ ke
54
+ kun
55
+ kvankam
56
+ kvazaŭ
57
+ la
58
+ laŭ
59
+ li
60
+ mi
61
+ mem
62
+ ne
63
+ nek
64
+ ni
65
+ nur
66
+ nun
67
+ nu
68
+ oni
69
+ ol
70
+ plu
71
+ por
72
+ per
73
+ post
74
+ preskaŭ
75
+ preter
76
+ pro
77
+ plej
78
+ pli
79
+ si
80
+ se
81
+ sed
82
+ sur
83
+ tra
84
+ tamen
85
+ trans
86
+ tre
87
+ tro
88
+ tuj
89
+ vi
90
+ l'
91
+ sub
92
+ ho
93
+ aux
94
+ ekde
95
+ pri
96
+ ili
97
+ ajn
98
+ ŝi
99
+ almenaŭ
100
+ ĉirkaŭ
101
+ ktp
102
+ ia
103
+ super
104
+ nord
105
+ krom
106
+ ambaŭ
107
+ malpli
108
+ escepte
109
+ #okcident
110
+ #nord
111
+ #sud
112
+ #orient
113
+ #centr
@@ -0,0 +1,10 @@
1
+ bo # in-law
2
+ dis # dis- (scattering)
3
+ ek # sudden or momentary action
4
+ eks # former
5
+ fi # shameful, nasty
6
+ ge # of both sexes
7
+ mal # opposite
8
+ mis # wrong, incorrect
9
+ pra # ancient
10
+ re # again, back
disvortilo/roots.txt ADDED
@@ -0,0 +1,502 @@
1
+ esperant
2
+ fingr
3
+
4
+
5
+ absolut
6
+ adres
7
+ afer
8
+ afrik
9
+ ag
10
+ akcept
11
+ akv
12
+ ali
13
+ alt
14
+ am
15
+ amas
16
+ amerik
17
+ amik
18
+ ampleks
19
+ amuz
20
+ angl
21
+ anonc
22
+ aparat
23
+ apart
24
+ aper
25
+ aranĝ
26
+ asoci
27
+ aspekt
28
+ atend
29
+ atent
30
+ ating
31
+ av
32
+ aĉet
33
+
34
+ aŭd
35
+ aŭskult
36
+ aŭtobus
37
+ banan
38
+ batal
39
+ baz
40
+ bedaŭr
41
+ bel
42
+ bend
43
+ bezon
44
+ bibliotek
45
+ bild
46
+ bilet
47
+ bird
48
+ bon
49
+ botel
50
+ bulgar
51
+ cel
52
+ centr
53
+ cert
54
+ ceter
55
+ cigared
56
+ dan
57
+ dank
58
+ daŭr
59
+ debat
60
+ decid
61
+ dek
62
+ dekstr
63
+ delegaci
64
+ demand
65
+ dev
66
+ dezir
67
+ deĵor
68
+ diabl
69
+ difin
70
+ dir
71
+ diskriminaci
72
+ diskut
73
+ divers
74
+ divid
75
+ dom
76
+ don
77
+ donac
78
+ dorm
79
+ du
80
+ edz
81
+ ekskurs
82
+ ekster
83
+ ekzempl
84
+ ekzempler
85
+ ekzist
86
+ elekt
87
+ entrepren
88
+ erinac
89
+ esenc
90
+ esper
91
+ est
92
+ eventual
93
+ evolu
94
+ eŭrop
95
+ facil
96
+ fak
97
+ fakt
98
+ fal
99
+ famili
100
+ far
101
+ fart
102
+ feliĉ
103
+ ferm
104
+ festival
105
+ film
106
+ fin
107
+ finn
108
+ firm
109
+ fiŝ
110
+ flank
111
+ flav
112
+ flor
113
+ flug
114
+ foj
115
+ forges
116
+ form
117
+ fort
118
+ fot
119
+ franc
120
+ frank
121
+ frat
122
+ fraz
123
+ fraŭl
124
+ frenez
125
+ fru
126
+ funkci
127
+ fuŝ
128
+ german
129
+ giĉet
130
+ grad
131
+ grand
132
+ gratul
133
+ grav
134
+ grup
135
+ gvid
136
+ ha
137
+ hav
138
+ hebre
139
+ hejm
140
+ help
141
+ hom
142
+ hor
143
+ hotel
144
+ ide
145
+ imag
146
+ infan
147
+ inform
148
+ instru
149
+ inteligent
150
+ interes
151
+ interpret
152
+ invit
153
+ ir
154
+ iran
155
+ ital
156
+ japan
157
+ jar
158
+ jun
159
+ kamp
160
+ kant
161
+ kapabl
162
+ kapt
163
+ kar
164
+ karot
165
+ kart
166
+ kaz
167
+ kaŝ
168
+ kelk
169
+ kilo
170
+ klar
171
+ klopod
172
+ knab
173
+ kolor
174
+ komenc
175
+ komision
176
+ komitat
177
+ kompetent
178
+ komplet
179
+ komplik
180
+ kompren
181
+ kon
182
+ koncept
183
+ koncern
184
+ kongres
185
+ konkret
186
+ konkurs
187
+ konsci
188
+ konsent
189
+ konserv
190
+ konsil
191
+ konsili
192
+ konsist
193
+ konstant
194
+ kontakt
195
+ kontrol
196
+ korb
197
+ korespond
198
+ kost
199
+ kovr
200
+ kred
201
+ kresk
202
+ kri
203
+ kruel
204
+ kuir
205
+ kuler
206
+ kultur
207
+ kunikl
208
+ kur
209
+ kutim
210
+ kvar
211
+ kvin
212
+ labor
213
+ lag
214
+ land
215
+ lanĉ
216
+ las
217
+ last
218
+ lav
219
+ leg
220
+ lern
221
+ lert
222
+ leter
223
+ lev
224
+ liber
225
+ libr
226
+ lig
227
+ lingv
228
+ lit
229
+ liter
230
+ literatur
231
+ lok
232
+ long
233
+ loĝ
234
+ lu
235
+ lud
236
+ man
237
+ manier
238
+ mank
239
+ manĝ
240
+ map
241
+ mar
242
+ marŝ
243
+ maten
244
+ material
245
+ maŝin
246
+ membr
247
+ memor
248
+ met
249
+ mez
250
+ miks
251
+ mil
252
+ minimum
253
+ minut
254
+ mir
255
+ moment
256
+ mon
257
+ monat
258
+ mond
259
+ mont
260
+ montr
261
+ mov
262
+ mult
263
+ naci
264
+ nask
265
+ naĝ
266
+ naŭ
267
+ neces
268
+ nederland
269
+ nepr
270
+ neŭtral
271
+ nivel
272
+ nokt
273
+ nom
274
+ normal
275
+ nov
276
+ nud
277
+ numer
278
+ ofert
279
+ oficial
280
+ oft
281
+ ok
282
+ okaz
283
+ okcident
284
+ okup
285
+ opini
286
+ ord
287
+ ordinar
288
+ organiz
289
+ orient
290
+ ov
291
+ pag
292
+ paper
293
+ pardon
294
+ parol
295
+ pas
296
+ patr
297
+ paĝ
298
+ paŝ
299
+ pend
300
+ pens
301
+ perd
302
+ perfekt
303
+ period
304
+ persik
305
+ person
306
+ pet
307
+ pied
308
+ plan
309
+ plank
310
+ plaĉ
311
+ plen
312
+ plor
313
+ plur
314
+ pokal
315
+ pom
316
+ pont
317
+ popular
318
+ port
319
+ postul
320
+ pov
321
+ prav
322
+ precip
323
+ preciz
324
+ prefer
325
+ preleg
326
+ premi
327
+ pren
328
+ prepar
329
+ pret
330
+ prez
331
+ prezent
332
+ prezid
333
+ princip
334
+ problem
335
+ produkt
336
+ profesi
337
+ profesor
338
+ program
339
+ proksim
340
+ propon
341
+ protest
342
+ protokol
343
+ prov
344
+ publik
345
+ punkt
346
+ pup
347
+ pur
348
+ rajt
349
+ rakont
350
+ rapid
351
+ raport
352
+ region
353
+ regul
354
+ reklam
355
+ rekomend
356
+ rekt
357
+ relativ
358
+ renkont
359
+ respond
360
+ rest
361
+ ricev
362
+ rid
363
+ rigard
364
+ rilat
365
+ rimark
366
+ river
367
+ riĉ
368
+ romp
369
+ rus
370
+ sal
371
+ salon
372
+ salt
373
+ salut
374
+ sam
375
+ sat
376
+ saĝ
377
+ saŭn
378
+ sci
379
+ scienc
380
+ seg
381
+ seks
382
+ sekv
383
+ semajn
384
+ senc
385
+ send
386
+ sent
387
+ sep
388
+ seri
389
+ serĉ
390
+ ses
391
+ sid
392
+ signif
393
+ simil
394
+ simpl
395
+ simul
396
+ sinjor
397
+ siren
398
+ sistem
399
+ situaci
400
+ skandinavi
401
+ skatol
402
+ ski
403
+ skrib
404
+ soci
405
+ sol
406
+ solv
407
+ sorĉ
408
+ spec
409
+ special
410
+ specif
411
+ spert
412
+ spinac
413
+ star
414
+ stat
415
+ statut
416
+ strat
417
+ struktur
418
+ stult
419
+ sud
420
+ sufiĉ
421
+ suk
422
+ sukces
423
+ supoz
424
+ supr
425
+ sved
426
+ svis
427
+ tabl
428
+ tabul
429
+ tag
430
+ tajp
431
+ task
432
+ te
433
+ teatr
434
+ tekst
435
+ teler
436
+ tem
437
+ temp
438
+ temperatur
439
+ ten
440
+ teren
441
+ terur
442
+ tim
443
+ traduk
444
+ trajn
445
+ trakt
446
+ tranĉ
447
+ tri
448
+ trink
449
+ trov
450
+ tuk
451
+ tut
452
+ tuŝ
453
+ universal
454
+ universitat
455
+ unu
456
+ urb
457
+ uson
458
+ util
459
+ uz
460
+ valor
461
+ varm
462
+ vast
463
+ ven
464
+ vend
465
+ venk
466
+ ver
467
+ verk
468
+ vesper
469
+ vest
470
+ veter
471
+ vetur
472
+ vid
473
+ vir
474
+ viv
475
+ vizit
476
+ voj
477
+ vojaĝ
478
+ vol
479
+ volv
480
+ vort
481
+ voĉ
482
+ vulp
483
+ zorg
484
+ ĉambr
485
+ ĉarm
486
+ ĉef
487
+ ĉeval
488
+ ĉin
489
+ ĝen
490
+ ĝeneral
491
+ ĝoj
492
+ ĝust
493
+ ĵet
494
+ ŝaf
495
+ ŝajn
496
+ ŝanc
497
+ ŝanĝ
498
+ ŝat
499
+ ŝip
500
+ ŝir
501
+ ŝlos
502
+ ŝtel
@@ -0,0 +1,45 @@
1
+ aĉ # contemptible
2
+ ad # frequent or continuous action (gerund)
3
+ aĵ # substance
4
+ an # member
5
+ ar # collection of
6
+ ĉj # male affectionate suffix
7
+ ebl # possibility
8
+ ec # abstract quality; -ness
9
+ eg # of great size or degree
10
+ ej # place
11
+ em # tending to
12
+ end # something that must be done
13
+ er # single, individual, unit
14
+ estr # leader
15
+ et # of small size or degree
16
+ id # offspring
17
+ ig # causing something to be
18
+ iĝ # becoming something
19
+ il # instrument; tool
20
+ ind # worthy of
21
+ ing # holder for an object
22
+ in # feminine suffix
23
+ ism # -ism
24
+ ist # -ist
25
+ nj # female affectionate suffix
26
+ obl # times (with numbers)
27
+ on # fraction (with numbers)
28
+ op # together (with numbers)
29
+ uj # container
30
+ ul # person possessing a certain quality
31
+ um # indefinite suffix
32
+
33
+ # passive forms
34
+ at
35
+ it
36
+ ot
37
+
38
+ # active forms
39
+ ant
40
+ int
41
+ ont
42
+
43
+
44
+ # Unofficial
45
+ iĉ # masculine form
@@ -0,0 +1,45 @@
1
+ Metadata-Version: 2.4
2
+ Name: disvortilo
3
+ Version: 0.1.0
4
+ Summary: Disvortilo is a simple tool that breaks Esperanto words into roots and affixes.
5
+ Author-email: Franz Weingartz <scaui0@gmx.net>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/LerniloEO/disvortilo
8
+ Project-URL: Repository, https://github.com/LerniloEO/disvortilo
9
+ Project-URL: Issues, https://github.com/LerniloEO/disvortilo/issues
10
+ Keywords: Esperanto,morphology,linguistics,NLP
11
+ Classifier: Topic :: Text Processing :: Linguistic
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Requires-Python: >=3.9
15
+ Description-Content-Type: text/markdown
16
+ License-File: LICENSE
17
+ Dynamic: license-file
18
+
19
+ # Disvortilo
20
+
21
+ Disvortilo is a simple tool that breaks Esperanto words into roots and affixes.
22
+
23
+ ## Getting Started
24
+
25
+ You can install Disvortilo from PyPI using pip:
26
+
27
+ ```shell
28
+ pip install disvortilo
29
+ ```
30
+
31
+ ## Examples
32
+
33
+ ```python
34
+ from disvortilo import Disvortilo
35
+
36
+ disvortilo = Disvortilo()
37
+
38
+ print(disvortilo.parse("malliberejo"))
39
+ # > [('mal', 'liber', 'ej', 'o')]
40
+
41
+ # some have more than one possible output
42
+ # like "Esperanto" which means "a hoping person"
43
+ print(disvortilo.parse("esperantistino"))
44
+ # > [('esper', 'ant', 'ist', 'in', 'o'), ('esperant', 'ist', 'in', 'o')]
45
+ ```
@@ -0,0 +1,10 @@
1
+ disvortilo/__init__.py,sha256=kk2bn3ukv7QPcOHoQ9N9vTT3X8FqRhzF4R_Z-Dtb73A,5158
2
+ disvortilo/full_words.txt,sha256=gqlLPTDPLnylGOWSQvEsxb_lOUUs1HtODUrMDbbSL94,578
3
+ disvortilo/prefixes.txt,sha256=jiVFM0haIRxwfL9wUrGqvcDI67E4lhgxxV9YrwLOe-w,199
4
+ disvortilo/roots.txt,sha256=07oJw43UbSGuVvpQu4jfkKj60zOyIhQOeaGcWuPziEg,2909
5
+ disvortilo/suffixes.txt,sha256=lwjCpl_4GN0G295P3W1o0abtUgU34o7sTpHyOpN8HTo,845
6
+ disvortilo-0.1.0.dist-info/licenses/LICENSE,sha256=3J4UnGzcrGIf2Lc5evXPrVJokpAYNGuvHYAlfzsjOmU,1072
7
+ disvortilo-0.1.0.dist-info/METADATA,sha256=8SY3i6t5WxkD2fsW9S26oxDyf7IQOM7d1NOdWuC1ezU,1281
8
+ disvortilo-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
9
+ disvortilo-0.1.0.dist-info/top_level.txt,sha256=K05C9mVwZZRGYnjvcJwsU8cg6L3fjrwE73ydJUCu5AA,11
10
+ disvortilo-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.10.2)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Franz Weingartz
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ disvortilo