kreuzberg 3.16.0__py3-none-any.whl → 3.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. kreuzberg/__init__.py +2 -0
  2. kreuzberg/_config.py +8 -9
  3. kreuzberg/_extractors/_base.py +0 -46
  4. kreuzberg/_extractors/_html.py +1 -1
  5. kreuzberg/_extractors/_pandoc.py +2 -2
  6. kreuzberg/_extractors/_pdf.py +4 -4
  7. kreuzberg/_gmft.py +2 -2
  8. kreuzberg/_mcp/server.py +1 -1
  9. kreuzberg/_mime_types.py +1 -1
  10. kreuzberg/_ocr/_easyocr.py +4 -9
  11. kreuzberg/_ocr/_paddleocr.py +1 -1
  12. kreuzberg/_ocr/_tesseract.py +15 -25
  13. kreuzberg/_token_reduction/__init__.py +11 -0
  14. kreuzberg/_token_reduction/_reducer.py +439 -0
  15. kreuzberg/_token_reduction/_stopwords.py +116 -0
  16. kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
  17. kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
  18. kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
  19. kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
  20. kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
  21. kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
  22. kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
  23. kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
  24. kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
  25. kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
  26. kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
  27. kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
  28. kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
  29. kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
  30. kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
  31. kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
  32. kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
  33. kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
  34. kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
  35. kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
  36. kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
  37. kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
  38. kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
  39. kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
  40. kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
  41. kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
  42. kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
  43. kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
  44. kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
  45. kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
  46. kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
  47. kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
  48. kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
  49. kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
  50. kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
  51. kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
  52. kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
  53. kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
  54. kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
  55. kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
  56. kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
  57. kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
  58. kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
  59. kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
  60. kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
  61. kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
  62. kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
  63. kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
  64. kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
  65. kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
  66. kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
  67. kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
  68. kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
  69. kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
  70. kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
  71. kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
  72. kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
  73. kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
  74. kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
  75. kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
  76. kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
  77. kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
  78. kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
  79. kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
  80. kreuzberg/_types.py +35 -3
  81. kreuzberg/_utils/_image_preprocessing.py +1 -1
  82. kreuzberg/_utils/_ref.py +14 -6
  83. kreuzberg/exceptions.py +0 -1
  84. kreuzberg/extraction.py +25 -9
  85. {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/METADATA +4 -3
  86. kreuzberg-3.17.0.dist-info/RECORD +128 -0
  87. kreuzberg-3.16.0.dist-info/RECORD +0 -61
  88. {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/WHEEL +0 -0
  89. {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/entry_points.txt +0 -0
  90. {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,196 @@
1
+ [
2
+ "אבל",
3
+ "או",
4
+ "אולי",
5
+ "אותה",
6
+ "אותו",
7
+ "אותי",
8
+ "אותך",
9
+ "אותם",
10
+ "אותן",
11
+ "אותנו",
12
+ "אז",
13
+ "אחר",
14
+ "אחרות",
15
+ "אחרי",
16
+ "אחריכן",
17
+ "אחרים",
18
+ "אחרת",
19
+ "אי",
20
+ "איזה",
21
+ "איך",
22
+ "אין",
23
+ "איפה",
24
+ "איתה",
25
+ "איתו",
26
+ "איתי",
27
+ "איתך",
28
+ "איתכם",
29
+ "איתכן",
30
+ "איתם",
31
+ "איתן",
32
+ "איתנו",
33
+ "אך",
34
+ "אל",
35
+ "אלה",
36
+ "אלו",
37
+ "אם",
38
+ "אנחנו",
39
+ "אני",
40
+ "אס",
41
+ "אף",
42
+ "אצל",
43
+ "אשר",
44
+ "את",
45
+ "אתה",
46
+ "אתכם",
47
+ "אתכן",
48
+ "אתם",
49
+ "אתן",
50
+ "באיזומידה",
51
+ "באמצע",
52
+ "באמצעות",
53
+ "בגלל",
54
+ "בין",
55
+ "בלי",
56
+ "במידה",
57
+ "במקוםשבו",
58
+ "ברם",
59
+ "בשביל",
60
+ "בשעהש",
61
+ "בתוך",
62
+ "גם",
63
+ "דרך",
64
+ "הוא",
65
+ "היא",
66
+ "היה",
67
+ "היכן",
68
+ "היתה",
69
+ "היתי",
70
+ "הם",
71
+ "הן",
72
+ "הנה",
73
+ "הסיבהשבגללה",
74
+ "הרי",
75
+ "ואילו",
76
+ "ואת",
77
+ "זאת",
78
+ "זה",
79
+ "זות",
80
+ "יהיה",
81
+ "יוכל",
82
+ "יוכלו",
83
+ "יותרמדי",
84
+ "יכול",
85
+ "יכולה",
86
+ "יכולות",
87
+ "יכולים",
88
+ "יכל",
89
+ "יכלה",
90
+ "יכלו",
91
+ "יש",
92
+ "כאן",
93
+ "כאשר",
94
+ "כולם",
95
+ "כולן",
96
+ "כזה",
97
+ "כי",
98
+ "כיצד",
99
+ "כך",
100
+ "ככה",
101
+ "כל",
102
+ "כלל",
103
+ "כמו",
104
+ "כן",
105
+ "כפי",
106
+ "כש",
107
+ "לא",
108
+ "לאו",
109
+ "לאיזותכלית",
110
+ "לאן",
111
+ "לבין",
112
+ "לה",
113
+ "להיות",
114
+ "להם",
115
+ "להן",
116
+ "לו",
117
+ "לי",
118
+ "לכם",
119
+ "לכן",
120
+ "למה",
121
+ "למטה",
122
+ "למעלה",
123
+ "למקוםשבו",
124
+ "למרות",
125
+ "לנו",
126
+ "לעבר",
127
+ "לעיכן",
128
+ "לפיכך",
129
+ "לפני",
130
+ "מאד",
131
+ "מאחורי",
132
+ "מאיזוסיבה",
133
+ "מאין",
134
+ "מאיפה",
135
+ "מבלי",
136
+ "מבעד",
137
+ "מדוע",
138
+ "מה",
139
+ "מהיכן",
140
+ "מול",
141
+ "מחוץ",
142
+ "מי",
143
+ "מכאן",
144
+ "מכיוון",
145
+ "מלבד",
146
+ "מן",
147
+ "מנין",
148
+ "מסוגל",
149
+ "מעט",
150
+ "מעטים",
151
+ "מעל",
152
+ "מצד",
153
+ "מקוםבו",
154
+ "מתחת",
155
+ "מתי",
156
+ "נגד",
157
+ "נגר",
158
+ "נו",
159
+ "עד",
160
+ "עז",
161
+ "על",
162
+ "עלי",
163
+ "עליה",
164
+ "עליהם",
165
+ "עליהן",
166
+ "עליו",
167
+ "עליך",
168
+ "עליכם",
169
+ "עלינו",
170
+ "עם",
171
+ "עצמה",
172
+ "עצמהם",
173
+ "עצמהן",
174
+ "עצמו",
175
+ "עצמי",
176
+ "עצמם",
177
+ "עצמן",
178
+ "עצמנו",
179
+ "פה",
180
+ "רק",
181
+ "שוב",
182
+ "של",
183
+ "שלה",
184
+ "שלהם",
185
+ "שלהן",
186
+ "שלו",
187
+ "שלי",
188
+ "שלך",
189
+ "שלכה",
190
+ "שלכם",
191
+ "שלכן",
192
+ "שלנו",
193
+ "שם",
194
+ "תהיה",
195
+ "תחת"
196
+ ]
@@ -0,0 +1,227 @@
1
+ [
2
+ "अंदर",
3
+ "अत",
4
+ "अदि",
5
+ "अप",
6
+ "अपना",
7
+ "अपनि",
8
+ "अपनी",
9
+ "अपने",
10
+ "अभि",
11
+ "अभी",
12
+ "आदि",
13
+ "आप",
14
+ "इंहिं",
15
+ "इंहें",
16
+ "इंहों",
17
+ "इतयादि",
18
+ "इत्यादि",
19
+ "इन",
20
+ "इनका",
21
+ "इन्हीं",
22
+ "इन्हें",
23
+ "इन्हों",
24
+ "इस",
25
+ "इसका",
26
+ "इसकि",
27
+ "इसकी",
28
+ "इसके",
29
+ "इसमें",
30
+ "इसि",
31
+ "इसी",
32
+ "इसे",
33
+ "उंहिं",
34
+ "उंहें",
35
+ "उंहों",
36
+ "उन",
37
+ "उनका",
38
+ "उनकि",
39
+ "उनकी",
40
+ "उनके",
41
+ "उनको",
42
+ "उन्हीं",
43
+ "उन्हें",
44
+ "उन्हों",
45
+ "उस",
46
+ "उसके",
47
+ "उसि",
48
+ "उसी",
49
+ "उसे",
50
+ "एक",
51
+ "एवं",
52
+ "एस",
53
+ "एसे",
54
+ "ऐसे",
55
+ "ओर",
56
+ "और",
57
+ "कइ",
58
+ "कई",
59
+ "कर",
60
+ "करता",
61
+ "करते",
62
+ "करना",
63
+ "करने",
64
+ "करें",
65
+ "कहते",
66
+ "कहा",
67
+ "का",
68
+ "काफि",
69
+ "काफ़ी",
70
+ "कि",
71
+ "किंहें",
72
+ "किंहों",
73
+ "कितना",
74
+ "किन्हें",
75
+ "किन्हों",
76
+ "किया",
77
+ "किर",
78
+ "किस",
79
+ "किसि",
80
+ "किसी",
81
+ "किसे",
82
+ "की",
83
+ "कुछ",
84
+ "कुल",
85
+ "के",
86
+ "को",
87
+ "कोइ",
88
+ "कोई",
89
+ "कोन",
90
+ "कोनसा",
91
+ "कौन",
92
+ "कौनसा",
93
+ "गया",
94
+ "घर",
95
+ "जब",
96
+ "जहाँ",
97
+ "जहां",
98
+ "जा",
99
+ "जिंहें",
100
+ "जिंहों",
101
+ "जितना",
102
+ "जिधर",
103
+ "जिन",
104
+ "जिन्हें",
105
+ "जिन्हों",
106
+ "जिस",
107
+ "जिसे",
108
+ "जीधर",
109
+ "जेसा",
110
+ "जेसे",
111
+ "जैसा",
112
+ "जैसे",
113
+ "जो",
114
+ "तक",
115
+ "तब",
116
+ "तरह",
117
+ "तिंहें",
118
+ "तिंहों",
119
+ "तिन",
120
+ "तिन्हें",
121
+ "तिन्हों",
122
+ "तिस",
123
+ "तिसे",
124
+ "तो",
125
+ "था",
126
+ "थि",
127
+ "थी",
128
+ "थे",
129
+ "दबारा",
130
+ "दवारा",
131
+ "दिया",
132
+ "दुसरा",
133
+ "दुसरे",
134
+ "दूसरे",
135
+ "दो",
136
+ "द्वारा",
137
+ "न",
138
+ "नहिं",
139
+ "नहीं",
140
+ "ना",
141
+ "निचे",
142
+ "निहायत",
143
+ "नीचे",
144
+ "ने",
145
+ "पर",
146
+ "पहले",
147
+ "पुरा",
148
+ "पूरा",
149
+ "पे",
150
+ "फिर",
151
+ "बनि",
152
+ "बनी",
153
+ "बहि",
154
+ "बही",
155
+ "बहुत",
156
+ "बाद",
157
+ "बाला",
158
+ "बिलकुल",
159
+ "भि",
160
+ "भितर",
161
+ "भी",
162
+ "भीतर",
163
+ "मगर",
164
+ "मानो",
165
+ "मे",
166
+ "में",
167
+ "यदि",
168
+ "यह",
169
+ "यहाँ",
170
+ "यहां",
171
+ "यहि",
172
+ "यही",
173
+ "या",
174
+ "यिह",
175
+ "ये",
176
+ "रखें",
177
+ "रवासा",
178
+ "रहा",
179
+ "रहे",
180
+ "ऱ्वासा",
181
+ "लिए",
182
+ "लिये",
183
+ "लेकिन",
184
+ "व",
185
+ "वगेरह",
186
+ "वरग",
187
+ "वर्ग",
188
+ "वह",
189
+ "वहाँ",
190
+ "वहां",
191
+ "वहिं",
192
+ "वहीं",
193
+ "वाले",
194
+ "वुह",
195
+ "वे",
196
+ "वग़ैरह",
197
+ "संग",
198
+ "सकता",
199
+ "सकते",
200
+ "सबसे",
201
+ "सभि",
202
+ "सभी",
203
+ "साथ",
204
+ "साबुत",
205
+ "साभ",
206
+ "सारा",
207
+ "से",
208
+ "सो",
209
+ "हि",
210
+ "ही",
211
+ "हुअ",
212
+ "हुआ",
213
+ "हुइ",
214
+ "हुई",
215
+ "हुए",
216
+ "हे",
217
+ "हें",
218
+ "है",
219
+ "हैं",
220
+ "हो",
221
+ "होता",
222
+ "होति",
223
+ "होती",
224
+ "होते",
225
+ "होना",
226
+ "होने"
227
+ ]
@@ -0,0 +1,181 @@
1
+ [
2
+ "a",
3
+ "ako",
4
+ "ali",
5
+ "bi",
6
+ "bih",
7
+ "bila",
8
+ "bili",
9
+ "bilo",
10
+ "bio",
11
+ "bismo",
12
+ "biste",
13
+ "biti",
14
+ "bumo",
15
+ "da",
16
+ "do",
17
+ "duž",
18
+ "ga",
19
+ "hoće",
20
+ "hoćemo",
21
+ "hoćete",
22
+ "hoćeš",
23
+ "hoću",
24
+ "i",
25
+ "iako",
26
+ "ih",
27
+ "ili",
28
+ "iz",
29
+ "ja",
30
+ "je",
31
+ "jedna",
32
+ "jedne",
33
+ "jedno",
34
+ "jer",
35
+ "jesam",
36
+ "jesi",
37
+ "jesmo",
38
+ "jest",
39
+ "jeste",
40
+ "jesu",
41
+ "jim",
42
+ "joj",
43
+ "još",
44
+ "ju",
45
+ "kada",
46
+ "kako",
47
+ "kao",
48
+ "koja",
49
+ "koje",
50
+ "koji",
51
+ "kojima",
52
+ "koju",
53
+ "kroz",
54
+ "li",
55
+ "me",
56
+ "mene",
57
+ "meni",
58
+ "mi",
59
+ "mimo",
60
+ "moj",
61
+ "moja",
62
+ "moje",
63
+ "mu",
64
+ "na",
65
+ "nad",
66
+ "nakon",
67
+ "nam",
68
+ "nama",
69
+ "nas",
70
+ "naš",
71
+ "naša",
72
+ "naše",
73
+ "našeg",
74
+ "ne",
75
+ "nego",
76
+ "neka",
77
+ "neki",
78
+ "nekog",
79
+ "neku",
80
+ "nema",
81
+ "netko",
82
+ "neće",
83
+ "nećemo",
84
+ "nećete",
85
+ "nećeš",
86
+ "neću",
87
+ "nešto",
88
+ "ni",
89
+ "nije",
90
+ "nikoga",
91
+ "nikoje",
92
+ "nikoju",
93
+ "nisam",
94
+ "nisi",
95
+ "nismo",
96
+ "niste",
97
+ "nisu",
98
+ "njega",
99
+ "njegov",
100
+ "njegova",
101
+ "njegovo",
102
+ "njemu",
103
+ "njezin",
104
+ "njezina",
105
+ "njezino",
106
+ "njih",
107
+ "njihov",
108
+ "njihova",
109
+ "njihovo",
110
+ "njim",
111
+ "njima",
112
+ "njoj",
113
+ "nju",
114
+ "no",
115
+ "o",
116
+ "od",
117
+ "odmah",
118
+ "on",
119
+ "ona",
120
+ "oni",
121
+ "ono",
122
+ "ova",
123
+ "pa",
124
+ "pak",
125
+ "po",
126
+ "pod",
127
+ "pored",
128
+ "prije",
129
+ "s",
130
+ "sa",
131
+ "sam",
132
+ "samo",
133
+ "se",
134
+ "sebe",
135
+ "sebi",
136
+ "si",
137
+ "smo",
138
+ "ste",
139
+ "su",
140
+ "sve",
141
+ "svi",
142
+ "svog",
143
+ "svoj",
144
+ "svoja",
145
+ "svoje",
146
+ "svom",
147
+ "ta",
148
+ "tada",
149
+ "taj",
150
+ "tako",
151
+ "te",
152
+ "tebe",
153
+ "tebi",
154
+ "ti",
155
+ "to",
156
+ "toj",
157
+ "tome",
158
+ "tu",
159
+ "tvoj",
160
+ "tvoja",
161
+ "tvoje",
162
+ "u",
163
+ "uz",
164
+ "vam",
165
+ "vama",
166
+ "vas",
167
+ "vaš",
168
+ "vaša",
169
+ "vaše",
170
+ "već",
171
+ "vi",
172
+ "vrlo",
173
+ "za",
174
+ "zar",
175
+ "će",
176
+ "ćemo",
177
+ "ćete",
178
+ "ćeš",
179
+ "ću",
180
+ "što"
181
+ ]