python-ucto 0.6.10__cp313-cp313-musllinux_1_1_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,184 @@
1
+ #embedsignature=True
2
+ #*****************************
3
+ # Python-ucto
4
+ # by Maarten van Gompel
5
+ # Centre for Language Studies
6
+ # Radboud University Nijmegen
7
+ #
8
+ # Licensed under GPLv3
9
+ #****************************/
10
+
11
+ from libcpp.string cimport string
12
+ from libcpp cimport bool
13
+ from libcpp.vector cimport vector
14
+ from cython.operator cimport dereference as deref, preincrement as inc
15
+ from cython import address
16
+ from libc.stdint cimport *
17
+ from libcpp.utility cimport pair
18
+ import os.path
19
+ import sys
20
+ cimport libfolia_classes
21
+ cimport ucto_classes
22
+
23
+ UCTODATAVERSION = "0.11"
24
+
25
class TokenRole:
    """Role flags for a token.

    Every non-zero member is a distinct power of two, so several roles can
    be stored together in one integer and tested with bitwise AND.
    """

    NOROLE = 0
    NOSPACE = 1 << 0
    BEGINOFSENTENCE = 1 << 1
    ENDOFSENTENCE = 1 << 2
    NEWPARAGRAPH = 1 << 3
    BEGINQUOTE = 1 << 4
    ENDQUOTE = 1 << 5
    TEMPENDOFSENTENCE = 1 << 6
34
+
35
class Token:
    """A token: its text, its type label and its role bitmask."""

    def __init__(self, text, tokentype, role):
        """Store the token text, the token type and the role bitmask."""
        self.text, self.tokentype, self.role = text, tokentype, role

    def __str__(self):
        """Return the token text."""
        return self.text

    def type(self):
        """Return the token type given at construction."""
        return self.tokentype

    def _rolebits(self, flag):
        """Return ``self.role`` masked with *flag* (non-zero when the flag is set)."""
        return self.role & flag

    def isendofsentence(self):
        """Return the ENDOFSENTENCE bit of the role (0 when not set)."""
        return self._rolebits(TokenRole.ENDOFSENTENCE)

    def iseos(self):  # short alias of isendofsentence, for lazy people
        """Return the ENDOFSENTENCE bit of the role (0 when not set)."""
        return self._rolebits(TokenRole.ENDOFSENTENCE)

    def isbeginofsentence(self):
        """Return the BEGINOFSENTENCE bit of the role (0 when not set)."""
        return self._rolebits(TokenRole.BEGINOFSENTENCE)

    def isnewparagraph(self):
        """Return the NEWPARAGRAPH bit of the role (0 when not set)."""
        return self._rolebits(TokenRole.NEWPARAGRAPH)

    def isbeginofquote(self):
        """Return the BEGINQUOTE bit of the role (0 when not set)."""
        return self._rolebits(TokenRole.BEGINQUOTE)

    def isendofquote(self):
        """Return the ENDQUOTE bit of the role (0 when not set)."""
        return self._rolebits(TokenRole.ENDQUOTE)

    def nospace(self):
        """Return the NOSPACE bit of the role (0 when not set)."""
        return self._rolebits(TokenRole.NOSPACE)
67
+
68
+
69
cdef class Tokenizer:
    """Python-facing wrapper around a ``ucto_classes.TokenizerClass`` instance.

    Text is fed in with :meth:`process` (or whole files with
    :meth:`tokenize`); results are read back either as sentence strings via
    :meth:`sentences` or as :class:`Token` objects by iterating the instance.
    """
    cdef ucto_classes.TokenizerClass tok

    def __init__(self, filename, **kwargs):
        """Initialise the tokeniser from a settings file and apply options.

        ``filename`` is encoded as UTF-8 and handed to the tokeniser's
        ``init``. Recognised keyword arguments:

        * ``lowercase``, ``uppercase``, ``paragraphdetection``,
          ``quotedetection``, ``sentenceperlineinput``,
          ``sentenceperlineoutput``, ``xmlinput``/``foliainput``,
          ``xmloutput``/``foliaoutput``: switches; only the exact value
          ``True`` enables them, anything else disables them.
        * ``docid``: document id used together with ``xmloutput``/
          ``foliaoutput`` (defaults to ``"untitled"``).
        * ``debug``: converted with ``int()`` and passed to ``setDebug``.
        * ``sentencedetection``: deprecated; only writes a warning to stderr.

        Raises:
            ValueError: for any other keyword argument.
        """
        self.tok.init(filename.encode('utf-8'))
        for arg, value in kwargs.items():
            if arg == 'lowercase':
                self.tok.setLowercase(value is True)
            elif arg == 'uppercase':
                self.tok.setUppercase(value is True)
            elif arg == 'sentencedetection':
                sys.stderr.write("[python-ucto] Argument 'sentencedetection' is deprecated and has no effect, it is always enabled.\n")
            elif arg == 'paragraphdetection':
                self.tok.setParagraphDetection(value is True)
            elif arg == 'quotedetection':
                self.tok.setQuoteDetection(value is True)
            elif arg == 'sentenceperlineinput':
                self.tok.setSentencePerLineInput(value is True)
            elif arg == 'sentenceperlineoutput':
                self.tok.setSentencePerLineOutput(value is True)
            elif arg == 'xmlinput' or arg == 'foliainput':
                self.tok.setXMLInput(value is True)
            elif arg == 'xmloutput' or arg == 'foliaoutput':
                # The document id travels in a separate keyword argument.
                docid = kwargs.get('docid', "untitled")
                self.tok.setXMLOutput(value is True, docid.encode('utf-8'))
            elif arg == 'debug':
                self.tok.setDebug(int(value))
            elif arg == 'docid':
                # Consumed by the xmloutput/foliaoutput branch above.
                pass
            else:
                raise ValueError("No such keyword argument: " + arg)

    def tokenize(self, str inputfile, str outputfile):
        """Run ucto from inputfile to outputfile (like command line tool)"""
        self.tok.tokenize(inputfile.encode('utf-8'), outputfile.encode('utf-8'))

    def process(self, str line):
        """Feed text to the tokeniser. This need not be a single line."""
        self.tok.tokenizeLine(line.encode('utf-8'))

    def sentences(self):
        """Yield each sentence returned by ``getUTF8Sentences()`` as a ``str``.

        Every ``"<utt>"`` marker is stripped from the decoded sentence.
        """
        cdef vector[string] results = self.tok.getUTF8Sentences()
        cdef vector[string].iterator it = results.begin()
        while it != results.end():
            yield str(deref(it), 'utf-8').replace("<utt>",'')
            inc(it)

    def lowercase(self):
        """Return the tokeniser's lowercase setting."""
        return self.tok.getLowercase()

    def uppercase(self):
        """Return the tokeniser's uppercase setting."""
        # Previously this returned getLowercase(), so it mirrored lowercase()
        # and the uppercase branch in __iter__ could never be taken on its own.
        # NOTE(review): assumes getUppercase() is declared in ucto_classes.pxd
        # alongside setUppercase() -- confirm.
        return self.tok.getUppercase()

    def __iter__(self):
        """Yield a :class:`Token` for every token of every pending sentence.

        Sentences are popped from the tokeniser until an empty one is
        returned. Token text is lowercased when the lowercase setting is
        on, otherwise uppercased when the uppercase setting is on.
        """
        cdef vector[ucto_classes.Token] v
        cdef vector[ucto_classes.Token].iterator it
        while True:
            v = self.tok.popSentence()
            if v.empty():
                break
            it = v.begin()
            while it != v.end():
                tokentext = str(deref(it).texttostring(), 'utf-8')
                tokentype = str(deref(it).typetostring(), 'utf-8')
                role = deref(it).role
                if self.lowercase():
                    tokentext = tokentext.lower()
                elif self.uppercase():
                    tokentext = tokentext.upper()
                yield Token(tokentext, tokentype, role)
                inc(it)
146
+
147
+
148
def localpath():
    """Resolve the per-user ucto directory from the environment.

    ``$UCTODATAPATH`` is returned verbatim when set. Otherwise the result is
    the ``ucto`` directory inside ``$XDG_CONFIG_HOME``, which itself falls
    back to ``.config`` under ``$HOME`` (an unset ``HOME`` counts as ``""``).
    """
    env = os.environ
    explicit = env.get("UCTODATAPATH")
    if explicit is not None:
        return explicit
    config_home = env.get("XDG_CONFIG_HOME")
    if config_home is None:
        config_home = os.path.join(env.get("HOME", ""), ".config")
    return os.path.join(config_home, "ucto")
151
+
152
def installdata(targetdir=None, version=UCTODATAVERSION):
    """Download the uctodata release and install its configuration files.

    Args:
        targetdir: Parent directory for the data; the files go into its
            ``ucto`` subdirectory. When ``None``, ``localpath()`` is used
            as the data directory itself.
        version: uctodata release version to download.

    If the data directory already exists, a message is printed to stderr and
    nothing is installed. Otherwise the release tarball is fetched with
    ``wget`` inside ``$TMPDIR`` (default ``/tmp``), unpacked with ``tar`` and
    its ``config/*`` files are moved into the data directory.

    Raises:
        Exception: when one of the shell command chains exits non-zero.
    """
    from shlex import quote  # function-scope import keeps the file's import block untouched

    if targetdir is None:
        targetdir = localpath()
    else:
        targetdir = os.path.join(targetdir, "ucto")
    # The shell commands below `cd` elsewhere first, so a relative path would
    # be created in a different place than the one tested with exists() here.
    targetdir = os.path.abspath(targetdir)

    if os.path.exists(targetdir):
        print(f"Uctodata configuration directory already exists: {targetdir}, refusing to overwrite, please remove it first if you want to install all data anew.", file=sys.stderr)
    else:
        tmpdir = os.environ.get("TMPDIR", "/tmp")
        # Quote every interpolated value: paths containing spaces or shell
        # metacharacters must reach the shell as single, literal words.
        q_target = quote(targetdir)
        q_srcdir = quote(f"uctodata-{version}")
        q_url = quote(f"https://github.com/LanguageMachines/uctodata/releases/download/v{version}/uctodata-{version}.tar.gz")
        install_cmd = " && ".join([
            f"cd {quote(tmpdir)}",
            f"mkdir -p {q_target}",
            f"wget -O uctodata.tar.gz {q_url}",
            "tar -xzf uctodata.tar.gz",
            f"cd {q_srcdir}",
            f"mv config/* {q_target}/",
            "cd ..",
            f"rm -Rf {q_srcdir}",
            "rm -Rf uctodata.tar.gz",
        ])
        if os.system(install_cmd) != 0:
            raise Exception("Installation failed")
        print(f"Installation of uctodata {version} complete", file=sys.stderr)
        # NOTE(review): the original indentation was not recoverable; the
        # textcat step is assumed to belong to the fresh-install branch -- confirm.
        if os.path.isdir("/usr/share/libexttextcat"):
            if os.system(f"cd {q_target} && wget -O textcat.cfg https://raw.githubusercontent.com/LanguageMachines/ucto/master/config/textcat.cfg") != 0:
                raise Exception("Installation of textcat.cfg failed")
        else:
            print("Language detection will not be available unless you install libexttextcat and rerun installdata()", file=sys.stderr)
169
+
170
+
171
+
172
+
173
+
174
+
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+
@@ -0,0 +1,27 @@
1
+ Metadata-Version: 2.4
2
+ Name: python-ucto
3
+ Version: 0.6.10
4
+ Summary: This is a Python binding to the tokenizer Ucto. Tokenisation is one of the first steps in almost any Natural Language Processing task, yet it is not always as trivial a task as it appears to be. This binding makes the power of the ucto tokeniser available to Python. Ucto itself is a regular-expression based, extensible, and advanced tokeniser written in C++ (https://languagemachines.github.io/ucto).
5
+ Home-page: https://github.com/proycon/python-ucto
6
+ Author: Maarten van Gompel
7
+ Author-email: proycon@anaproy.nl
8
+ License: GPL-3.0-only
9
+ Keywords: tokenizer tokenization tokeniser tokenisation nlp computational_linguistics ucto
10
+ Classifier: Development Status :: 5 - Production/Stable
11
+ Classifier: Topic :: Text Processing :: Linguistic
12
+ Classifier: Programming Language :: Cython
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Operating System :: POSIX
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: Intended Audience :: Science/Research
17
+ Requires: ucto (>=0.36)
18
+ Requires-Dist: Cython
19
+ Dynamic: author
20
+ Dynamic: author-email
21
+ Dynamic: classifier
22
+ Dynamic: home-page
23
+ Dynamic: keywords
24
+ Dynamic: license
25
+ Dynamic: requires
26
+ Dynamic: requires-dist
27
+ Dynamic: summary
@@ -0,0 +1,20 @@
1
+ ucto.cpython-313-x86_64-linux-musl.so,sha256=sU2vnCSzSW0rAa1YIlkVM8fLpFagQDhcL9CM9MD8j9I,2724945
2
+ python_ucto-0.6.10.dist-info/METADATA,sha256=h3BR4GkyhTa1ijcp0ai7dmuzA3w7hU1Si596CyYnBpg,1236
3
+ python_ucto-0.6.10.dist-info/WHEEL,sha256=TIU9MrPwV2TTDER1sJLMhWI27WMvHL-X55XCyK-2nY8,113
4
+ python_ucto-0.6.10.dist-info/top_level.txt,sha256=7zjSe25cKXiD3nFVCqHOi-tWjWL3N2nPFaVCsq3V6mw,5
5
+ python_ucto-0.6.10.dist-info/RECORD,,
6
+ python_ucto-0.6.10.data/data/sources/ucto_wrapper.pyx,sha256=hsussZVPOXyikT9gtnBjR_aE-eWZhdQnvpopiM5XJRU,6392
7
+ python_ucto.libs/liblzma-712ecf44.so.5.2.5,sha256=7kvrZ6mnhB-8_rW2_AEv7H3i3sRDIqDUJaRIzu2lAfo,142889
8
+ python_ucto.libs/libgomp-cd951527.so.1.0.0,sha256=U2zda7o5P6DwCQHNyzt7Hw13pk9wKHfh-c4ZgCzbfsU,187009
9
+ python_ucto.libs/libfolia-cd722aa4.so.22.0.0,sha256=OW0UC9lUraf-mFE7wT3wz0mG2R7erf6bmKSJmKUR29M,36662377
10
+ python_ucto.libs/libicudata-33bdc345.so.67.1,sha256=W7tSBH0i_bTl4D-LWm6PfW2ga60jf0bD4w951ILuNmg,28405913
11
+ python_ucto.libs/libicuio-a3ad3dd8.so.67.1,sha256=P1Xrzbhf6OVvKWhDd8-XUb5xJgJ8wHuADjvzYbpn76E,60353
12
+ python_ucto.libs/libbz2-e55174b0.so.1.0.8,sha256=nLQu8p2AuUOP8XKyFjOxRcuzbTMhNmeEMKDgu4ZqMzI,62697
13
+ python_ucto.libs/libexttextcat-2-aa3651c4.0.so.0.0.0,sha256=7J4Eo2SdMzAWnFwgHlqIUEPQSF2Z9TUbIUxdyA6Ho6Y,21481
14
+ python_ucto.libs/libucto-8ce70bd9.so.6.0.0,sha256=_tt3DaCNeD229Z4PgRh04EYgvv9itl5Ka683YGT1174,8417425
15
+ python_ucto.libs/libicuuc-98673c7b.so.67.1,sha256=QFgJGr38T8HZa5tMm-WCN1Oh_NJ4W7gxmcxtdYBepZc,2059401
16
+ python_ucto.libs/libgcc_s-a04fdf82.so.1,sha256=YxqJNaesQMhDswHEQpXsiLnVvMBBbYO6KYMDZFPWKSM,81257
17
+ python_ucto.libs/libxml2-0d3d92a0.so.2.9.14,sha256=G5uW2wHM6q8ILK1oYTAsEpi2WhHvESMja4F6NfuL7Ns,1306513
18
+ python_ucto.libs/libticcutils-13aac825.so.10.0.0,sha256=PraCNTuv2cMCaW-Bk13DCfXQq7XdP2qs4HJI7OroDTw,10044985
19
+ python_ucto.libs/libstdc++-a9383cce.so.6.0.28,sha256=Wy9UCdwS1rwI9GU5e7qE61S0AkRqqwti1q_adWSs-Rk,2447393
20
+ python_ucto.libs/libicui18n-17bc7491.so.67.1,sha256=l5BBF2b4GUYXvebfa2BWzCtR1JTLzeuzTXP3HeK-xf4,3750521
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.10.2)
3
+ Root-Is-Purelib: false
4
+ Tag: cp313-cp313-musllinux_1_1_x86_64
5
+
@@ -0,0 +1 @@
1
+ ucto
Binary file
Binary file