semantic-compressor 2.0__py3-none-any.whl → 2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compressor/resources/nltk_data/tokenizers/punkt_tab/README +98 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/czech/abbrev_types.txt +118 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/czech/collocations.tab +96 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/czech/ortho_context.tab +52789 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/czech/sent_starters.txt +54 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/danish/abbrev_types.txt +211 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/danish/collocations.tab +101 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/danish/ortho_context.tab +53913 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/danish/sent_starters.txt +64 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/abbrev_types.txt +99 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/collocations.tab +37 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/ortho_context.tab +32208 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/dutch/sent_starters.txt +54 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt +156 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/english/collocations.tab +37 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/english/ortho_context.tab +20366 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/english/sent_starters.txt +39 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/abbrev_types.txt +48 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/collocations.tab +100 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/ortho_context.tab +68544 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/estonian/sent_starters.txt +89 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/abbrev_types.txt +81 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/collocations.tab +167 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/ortho_context.tab +79765 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/finnish/sent_starters.txt +86 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/french/abbrev_types.txt +61 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/french/collocations.tab +18 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/french/ortho_context.tab +26726 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/french/sent_starters.txt +48 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/german/abbrev_types.txt +71 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/german/collocations.tab +28 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/german/ortho_context.tab +60260 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/german/sent_starters.txt +107 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/greek/abbrev_types.txt +100 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/greek/collocations.tab +7 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/greek/ortho_context.tab +29624 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/greek/sent_starters.txt +54 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/italian/abbrev_types.txt +125 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/italian/collocations.tab +6 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/italian/ortho_context.tab +29929 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/italian/sent_starters.txt +40 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/abbrev_types.txt +285 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/collocations.tab +153 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/ortho_context.tab +10520 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/malayalam/sent_starters.txt +14 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/abbrev_types.txt +106 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/collocations.tab +54 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/ortho_context.tab +54125 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/norwegian/sent_starters.txt +63 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/polish/abbrev_types.txt +225 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/polish/collocations.tab +57 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/polish/ortho_context.tab +81425 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/polish/sent_starters.txt +71 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/abbrev_types.txt +72 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/collocations.tab +5 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/ortho_context.tab +30167 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/portuguese/sent_starters.txt +40 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/russian/abbrev_types.txt +1989 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/russian/collocations.tab +0 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/russian/ortho_context.tab +1 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/russian/sent_starters.txt +0 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/abbrev_types.txt +73 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/collocations.tab +74 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/ortho_context.tab +35434 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/slovene/sent_starters.txt +58 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/abbrev_types.txt +66 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/collocations.tab +7 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/ortho_context.tab +27443 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/spanish/sent_starters.txt +46 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/abbrev_types.txt +39 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/collocations.tab +8 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/ortho_context.tab +44485 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/swedish/sent_starters.txt +49 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/abbrev_types.txt +67 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/collocations.tab +14 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/ortho_context.tab +45926 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab/turkish/sent_starters.txt +87 -0
- compressor/resources/nltk_data/tokenizers/punkt_tab.zip +0 -0
- compressor/semantic.py +1 -1
- {semantic_compressor-2.0.dist-info → semantic_compressor-2.2.dist-info}/METADATA +1 -1
- {semantic_compressor-2.0.dist-info → semantic_compressor-2.2.dist-info}/RECORD +84 -6
- {semantic_compressor-2.0.dist-info → semantic_compressor-2.2.dist-info}/LICENSE +0 -0
- {semantic_compressor-2.0.dist-info → semantic_compressor-2.2.dist-info}/WHEEL +0 -0
- {semantic_compressor-2.0.dist-info → semantic_compressor-2.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,54 @@
|
|
1
|
+
het
|
2
|
+
daardoor
|
3
|
+
de
|
4
|
+
er
|
5
|
+
hoewel
|
6
|
+
wat
|
7
|
+
urlings
|
8
|
+
na
|
9
|
+
ze
|
10
|
+
alleen
|
11
|
+
dat
|
12
|
+
ik
|
13
|
+
pijls
|
14
|
+
wie
|
15
|
+
daarna
|
16
|
+
foto
|
17
|
+
als
|
18
|
+
boer
|
19
|
+
hammes
|
20
|
+
verder
|
21
|
+
ook
|
22
|
+
evers
|
23
|
+
vandaar
|
24
|
+
toen
|
25
|
+
we
|
26
|
+
langenberg
|
27
|
+
naast
|
28
|
+
want
|
29
|
+
in
|
30
|
+
wij
|
31
|
+
zo
|
32
|
+
hendrikx
|
33
|
+
daar
|
34
|
+
crouzen
|
35
|
+
dit
|
36
|
+
daarnaast
|
37
|
+
anp
|
38
|
+
zij
|
39
|
+
behalve
|
40
|
+
waarom
|
41
|
+
daarom
|
42
|
+
bovendien
|
43
|
+
hij
|
44
|
+
daarbij
|
45
|
+
nee
|
46
|
+
volgens
|
47
|
+
daarmee
|
48
|
+
bukkems
|
49
|
+
dvnl
|
50
|
+
eén
|
51
|
+
pas
|
52
|
+
tijdens
|
53
|
+
vooral
|
54
|
+
maar
|
@@ -0,0 +1,156 @@
|
|
1
|
+
ct
|
2
|
+
m.j
|
3
|
+
t
|
4
|
+
a.c
|
5
|
+
n.h
|
6
|
+
ms
|
7
|
+
p.a.m
|
8
|
+
dr
|
9
|
+
pa
|
10
|
+
p.m
|
11
|
+
u.k
|
12
|
+
st
|
13
|
+
dec
|
14
|
+
u.s.a
|
15
|
+
lt
|
16
|
+
g.k
|
17
|
+
adm
|
18
|
+
p
|
19
|
+
h.m
|
20
|
+
ga
|
21
|
+
tenn
|
22
|
+
yr
|
23
|
+
sen
|
24
|
+
n.c
|
25
|
+
j.j
|
26
|
+
d.h
|
27
|
+
s.g
|
28
|
+
inc
|
29
|
+
vs
|
30
|
+
s.p.a
|
31
|
+
a.t
|
32
|
+
n
|
33
|
+
feb
|
34
|
+
sr
|
35
|
+
jan
|
36
|
+
s.a.y
|
37
|
+
n.y
|
38
|
+
col
|
39
|
+
g.f
|
40
|
+
c.o.m.b
|
41
|
+
d
|
42
|
+
ft
|
43
|
+
va
|
44
|
+
r.k
|
45
|
+
e.f
|
46
|
+
chg
|
47
|
+
r.i
|
48
|
+
a.g
|
49
|
+
minn
|
50
|
+
a.h
|
51
|
+
k
|
52
|
+
n.j
|
53
|
+
m
|
54
|
+
l.f
|
55
|
+
f.j
|
56
|
+
gen
|
57
|
+
i.m.s
|
58
|
+
s.a
|
59
|
+
aug
|
60
|
+
j.p
|
61
|
+
okla
|
62
|
+
m.d.c
|
63
|
+
ltd
|
64
|
+
oct
|
65
|
+
s
|
66
|
+
vt
|
67
|
+
r.a
|
68
|
+
j.c
|
69
|
+
ariz
|
70
|
+
w.w
|
71
|
+
b.v
|
72
|
+
ore
|
73
|
+
h
|
74
|
+
w.r
|
75
|
+
e.h
|
76
|
+
mrs
|
77
|
+
cie
|
78
|
+
corp
|
79
|
+
w
|
80
|
+
n.v
|
81
|
+
a.d
|
82
|
+
r.j
|
83
|
+
ok
|
84
|
+
. .
|
85
|
+
e.m
|
86
|
+
w.c
|
87
|
+
ill
|
88
|
+
nov
|
89
|
+
u.s
|
90
|
+
prof
|
91
|
+
conn
|
92
|
+
u.s.s.r
|
93
|
+
mg
|
94
|
+
f.g
|
95
|
+
ph.d
|
96
|
+
g
|
97
|
+
calif
|
98
|
+
messrs
|
99
|
+
h.f
|
100
|
+
wash
|
101
|
+
tues
|
102
|
+
sw
|
103
|
+
bros
|
104
|
+
u.n
|
105
|
+
l
|
106
|
+
wis
|
107
|
+
mr
|
108
|
+
sep
|
109
|
+
d.c
|
110
|
+
ave
|
111
|
+
e.l
|
112
|
+
co
|
113
|
+
s.s
|
114
|
+
reps
|
115
|
+
c
|
116
|
+
r.t
|
117
|
+
h.c
|
118
|
+
r
|
119
|
+
wed
|
120
|
+
a.s
|
121
|
+
v
|
122
|
+
fla
|
123
|
+
jr
|
124
|
+
r.h
|
125
|
+
c.v
|
126
|
+
m.b.a
|
127
|
+
rep
|
128
|
+
a.a
|
129
|
+
e
|
130
|
+
c.i.t
|
131
|
+
l.a
|
132
|
+
b.f
|
133
|
+
j.b
|
134
|
+
d.w
|
135
|
+
j.k
|
136
|
+
ala
|
137
|
+
f
|
138
|
+
w.va
|
139
|
+
sept
|
140
|
+
mich
|
141
|
+
n.m
|
142
|
+
j.r
|
143
|
+
l.p
|
144
|
+
s.c
|
145
|
+
colo
|
146
|
+
fri
|
147
|
+
a.m
|
148
|
+
g.d
|
149
|
+
kan
|
150
|
+
maj
|
151
|
+
ky
|
152
|
+
a.m.e
|
153
|
+
n.d
|
154
|
+
t.j
|
155
|
+
cos
|
156
|
+
nev
|
@@ -0,0 +1,37 @@
|
|
1
|
+
##number## international
|
2
|
+
##number## rj
|
3
|
+
##number## commodities
|
4
|
+
##number## cooper
|
5
|
+
b stewart
|
6
|
+
##number## genentech
|
7
|
+
##number## wedgestone
|
8
|
+
i toussie
|
9
|
+
##number## pepper
|
10
|
+
j fialka
|
11
|
+
o ludcke
|
12
|
+
##number## insider
|
13
|
+
##number## aes
|
14
|
+
i magnin
|
15
|
+
##number## credit
|
16
|
+
##number## corrections
|
17
|
+
##number## financing
|
18
|
+
##number## henley
|
19
|
+
##number## business
|
20
|
+
##number## pay-fone
|
21
|
+
b wigton
|
22
|
+
b edelman
|
23
|
+
b levine
|
24
|
+
##number## leisure
|
25
|
+
b smith
|
26
|
+
j walter
|
27
|
+
##number## pegasus
|
28
|
+
##number## dividend
|
29
|
+
j aron
|
30
|
+
##number## review
|
31
|
+
##number## abreast
|
32
|
+
##number## who
|
33
|
+
##number## letters
|
34
|
+
##number## colgate
|
35
|
+
##number## cbot
|
36
|
+
##number## notable
|
37
|
+
##number## zimmer
|