iudex-core 1.0.0-java → 1.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/History.rdoc +21 -0
- data/Manifest.txt +9 -1
- data/Rakefile +6 -6
- data/bin/iudex-test-config +1 -1
- data/bin/iudex-url-norm +4 -4
- data/build/effective_tld_name.dat +432 -29
- data/config/mojibake +268 -0
- data/lib/iudex-core/base.rb +1 -1
- data/lib/iudex-core/iudex-core-1.1.0.jar +0 -0
- data/lib/iudex-core/mojibake.rb +73 -0
- data/lib/iudex-core.rb +8 -2
- data/pom.xml +5 -5
- data/test/test_content_fetcher.rb +37 -39
- data/test/test_content_source.rb +75 -0
- data/test/test_mojibake.rb +58 -0
- data/test/test_redirect_handler.rb +170 -0
- data/test/test_visit_manager.rb +107 -0
- data/test/test_visit_queue.rb +268 -0
- data/test/test_visit_url.rb +150 -0
- metadata +26 -16
- data/lib/iudex-core/iudex-core-1.0.0.jar +0 -0
data/config/mojibake
ADDED
@@ -0,0 +1,268 @@
|
|
1
|
+
# -*- coding: utf-8 -*- mojibake: 1.0.0
|
2
|
+
/Â[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Ã[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Å[\u0092\u0093\u00A0¡¸½¾’“]|Æ[\u0092’]|Ë[\u0086\u009Cœ†]|â(\u0080[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u0093\u0094\u0098\u0099\u009A\u009C\u009D\u009E\u00A0¡¢¦°¹º]|\u0081\u00A0|\u0082¬|\u0084¢|‚¬|„¢|€[\u0081\u009D\u00A0¡¢¦°¹ºœŠšžƒˆ˜‚“”„†‡…‰‹€™\uFFFD]|\uFFFD\u00A0)|ï(»¿|¿[½¾])/
|
3
|
+
|
4
|
+
Moji UNICODE Org CODE
|
5
|
+
+---- ---- ---- ---- ----- ---+
|
6
|
+
[Â] 00C2 0080 [] 0080
|
7
|
+
[Â] 00C2 0081 [] 0081
|
8
|
+
[Â] 00C2 0082 [] 0082
|
9
|
+
[Â] 00C2 0083 [] 0083
|
10
|
+
[Â] 00C2 0084 [] 0084
|
11
|
+
[Â
] 00C2 0085 [
] 0085
|
12
|
+
[Â] 00C2 0086 [] 0086
|
13
|
+
[Â] 00C2 0087 [] 0087
|
14
|
+
[Â] 00C2 0088 [] 0088
|
15
|
+
[Â] 00C2 0089 [] 0089
|
16
|
+
[Â] 00C2 008A [] 008A
|
17
|
+
[Â] 00C2 008B [] 008B
|
18
|
+
[Â] 00C2 008C [] 008C
|
19
|
+
[Â] 00C2 008D [] 008D
|
20
|
+
[Â] 00C2 008E [] 008E
|
21
|
+
[Â] 00C2 008F [] 008F
|
22
|
+
[Â] 00C2 0090 [] 0090
|
23
|
+
[Â] 00C2 0091 [] 0091
|
24
|
+
[Â] 00C2 0092 [] 0092
|
25
|
+
[Â] 00C2 0093 [] 0093
|
26
|
+
[Â] 00C2 0094 [] 0094
|
27
|
+
[Â] 00C2 0095 [] 0095
|
28
|
+
[Â] 00C2 0096 [] 0096
|
29
|
+
[Â] 00C2 0097 [] 0097
|
30
|
+
[Â] 00C2 0098 [] 0098
|
31
|
+
[Â] 00C2 0099 [] 0099
|
32
|
+
[Â] 00C2 009A [] 009A
|
33
|
+
[Â] 00C2 009B [] 009B
|
34
|
+
[Â] 00C2 009C [] 009C
|
35
|
+
[Â] 00C2 009D [] 009D
|
36
|
+
[Â] 00C2 009E [] 009E
|
37
|
+
[Â] 00C2 009F [] 009F
|
38
|
+
[Â ] 00C2 00A0 [ ] 00A0
|
39
|
+
[¡] 00C2 00A1 [¡] 00A1
|
40
|
+
[¢] 00C2 00A2 [¢] 00A2
|
41
|
+
[£] 00C2 00A3 [£] 00A3
|
42
|
+
[¤] 00C2 00A4 [¤] 00A4
|
43
|
+
[Â¥] 00C2 00A5 [¥] 00A5
|
44
|
+
[¦] 00C2 00A6 [¦] 00A6
|
45
|
+
[§] 00C2 00A7 [§] 00A7
|
46
|
+
[¨] 00C2 00A8 [¨] 00A8
|
47
|
+
[©] 00C2 00A9 [©] 00A9
|
48
|
+
[ª] 00C2 00AA [ª] 00AA
|
49
|
+
[«] 00C2 00AB [«] 00AB
|
50
|
+
[¬] 00C2 00AC [¬] 00AC
|
51
|
+
[Â] 00C2 00AD [] 00AD
|
52
|
+
[®] 00C2 00AE [®] 00AE
|
53
|
+
[¯] 00C2 00AF [¯] 00AF
|
54
|
+
[°] 00C2 00B0 [°] 00B0
|
55
|
+
[±] 00C2 00B1 [±] 00B1
|
56
|
+
[²] 00C2 00B2 [²] 00B2
|
57
|
+
[³] 00C2 00B3 [³] 00B3
|
58
|
+
[´] 00C2 00B4 [´] 00B4
|
59
|
+
[µ] 00C2 00B5 [µ] 00B5
|
60
|
+
[¶] 00C2 00B6 [¶] 00B6
|
61
|
+
[·] 00C2 00B7 [·] 00B7
|
62
|
+
[¸] 00C2 00B8 [¸] 00B8
|
63
|
+
[¹] 00C2 00B9 [¹] 00B9
|
64
|
+
[º] 00C2 00BA [º] 00BA
|
65
|
+
[»] 00C2 00BB [»] 00BB
|
66
|
+
[¼] 00C2 00BC [¼] 00BC
|
67
|
+
[½] 00C2 00BD [½] 00BD
|
68
|
+
[¾] 00C2 00BE [¾] 00BE
|
69
|
+
[¿] 00C2 00BF [¿] 00BF
|
70
|
+
[Œ] 00C2 0152 [] 008C
|
71
|
+
[œ] 00C2 0153 [] 009C
|
72
|
+
[Š] 00C2 0160 [] 008A
|
73
|
+
[š] 00C2 0161 [] 009A
|
74
|
+
[Ÿ] 00C2 0178 [] 009F
|
75
|
+
[ÂŽ] 00C2 017D [] 008E
|
76
|
+
[ž] 00C2 017E [] 009E
|
77
|
+
[ƒ] 00C2 0192 [] 0083
|
78
|
+
[ˆ] 00C2 02C6 [] 0088
|
79
|
+
[˜] 00C2 02DC [] 0098
|
80
|
+
[–] 00C2 2013 [] 0096
|
81
|
+
[—] 00C2 2014 [] 0097
|
82
|
+
[‘] 00C2 2018 [] 0091
|
83
|
+
[Â’] 00C2 2019 [] 0092
|
84
|
+
[‚] 00C2 201A [] 0082
|
85
|
+
[“] 00C2 201C [] 0093
|
86
|
+
[”] 00C2 201D [] 0094
|
87
|
+
[„] 00C2 201E [] 0084
|
88
|
+
[†] 00C2 2020 [] 0086
|
89
|
+
[‡] 00C2 2021 [] 0087
|
90
|
+
[•] 00C2 2022 [] 0095
|
91
|
+
[Â…] 00C2 2026 [
] 0085
|
92
|
+
[‰] 00C2 2030 [] 0089
|
93
|
+
[‹] 00C2 2039 [] 008B
|
94
|
+
[›] 00C2 203A [] 009B
|
95
|
+
[€] 00C2 20AC [] 0080
|
96
|
+
[™] 00C2 2122 [] 0099
|
97
|
+
[�] 00C2 FFFD [] 0081
|
98
|
+
[Ã] 00C3 0080 [À] 00C0
|
99
|
+
[Ã] 00C3 0081 [Á] 00C1
|
100
|
+
[Ã] 00C3 0082 [Â] 00C2
|
101
|
+
[Ã] 00C3 0083 [Ã] 00C3
|
102
|
+
[Ã] 00C3 0084 [Ä] 00C4
|
103
|
+
[Ã
] 00C3 0085 [Å] 00C5
|
104
|
+
[Ã] 00C3 0086 [Æ] 00C6
|
105
|
+
[Ã] 00C3 0087 [Ç] 00C7
|
106
|
+
[Ã] 00C3 0088 [È] 00C8
|
107
|
+
[Ã] 00C3 0089 [É] 00C9
|
108
|
+
[Ã] 00C3 008A [Ê] 00CA
|
109
|
+
[Ã] 00C3 008B [Ë] 00CB
|
110
|
+
[Ã] 00C3 008C [Ì] 00CC
|
111
|
+
[Ã] 00C3 008D [Í] 00CD
|
112
|
+
[Ã] 00C3 008E [Î] 00CE
|
113
|
+
[Ã] 00C3 008F [Ï] 00CF
|
114
|
+
[Ã] 00C3 0090 [Ð] 00D0
|
115
|
+
[Ã] 00C3 0091 [Ñ] 00D1
|
116
|
+
[Ã] 00C3 0092 [Ò] 00D2
|
117
|
+
[Ã] 00C3 0093 [Ó] 00D3
|
118
|
+
[Ã] 00C3 0094 [Ô] 00D4
|
119
|
+
[Ã] 00C3 0095 [Õ] 00D5
|
120
|
+
[Ã] 00C3 0096 [Ö] 00D6
|
121
|
+
[Ã] 00C3 0097 [×] 00D7
|
122
|
+
[Ã] 00C3 0098 [Ø] 00D8
|
123
|
+
[Ã] 00C3 0099 [Ù] 00D9
|
124
|
+
[Ã] 00C3 009A [Ú] 00DA
|
125
|
+
[Ã] 00C3 009B [Û] 00DB
|
126
|
+
[Ã] 00C3 009C [Ü] 00DC
|
127
|
+
[Ã] 00C3 009D [Ý] 00DD
|
128
|
+
[Ã] 00C3 009E [Þ] 00DE
|
129
|
+
[Ã] 00C3 009F [ß] 00DF
|
130
|
+
[Ã ] 00C3 00A0 [à] 00E0
|
131
|
+
[á] 00C3 00A1 [á] 00E1
|
132
|
+
[â] 00C3 00A2 [â] 00E2
|
133
|
+
[ã] 00C3 00A3 [ã] 00E3
|
134
|
+
[ä] 00C3 00A4 [ä] 00E4
|
135
|
+
[Ã¥] 00C3 00A5 [å] 00E5
|
136
|
+
[æ] 00C3 00A6 [æ] 00E6
|
137
|
+
[ç] 00C3 00A7 [ç] 00E7
|
138
|
+
[è] 00C3 00A8 [è] 00E8
|
139
|
+
[é] 00C3 00A9 [é] 00E9
|
140
|
+
[ê] 00C3 00AA [ê] 00EA
|
141
|
+
[ë] 00C3 00AB [ë] 00EB
|
142
|
+
[ì] 00C3 00AC [ì] 00EC
|
143
|
+
[Ã] 00C3 00AD [í] 00ED
|
144
|
+
[î] 00C3 00AE [î] 00EE
|
145
|
+
[ï] 00C3 00AF [ï] 00EF
|
146
|
+
[ð] 00C3 00B0 [ð] 00F0
|
147
|
+
[ñ] 00C3 00B1 [ñ] 00F1
|
148
|
+
[ò] 00C3 00B2 [ò] 00F2
|
149
|
+
[ó] 00C3 00B3 [ó] 00F3
|
150
|
+
[ô] 00C3 00B4 [ô] 00F4
|
151
|
+
[õ] 00C3 00B5 [õ] 00F5
|
152
|
+
[ö] 00C3 00B6 [ö] 00F6
|
153
|
+
[÷] 00C3 00B7 [÷] 00F7
|
154
|
+
[ø] 00C3 00B8 [ø] 00F8
|
155
|
+
[ù] 00C3 00B9 [ù] 00F9
|
156
|
+
[ú] 00C3 00BA [ú] 00FA
|
157
|
+
[û] 00C3 00BB [û] 00FB
|
158
|
+
[ü] 00C3 00BC [ü] 00FC
|
159
|
+
[ý] 00C3 00BD [ý] 00FD
|
160
|
+
[þ] 00C3 00BE [þ] 00FE
|
161
|
+
[ÿ] 00C3 00BF [ÿ] 00FF
|
162
|
+
[ÃŒ] 00C3 0152 [Ì] 00CC
|
163
|
+
[Ü] 00C3 0153 [Ü] 00DC
|
164
|
+
[Ê] 00C3 0160 [Ê] 00CA
|
165
|
+
[Ú] 00C3 0161 [Ú] 00DA
|
166
|
+
[ß] 00C3 0178 [ß] 00DF
|
167
|
+
[ÃŽ] 00C3 017D [Î] 00CE
|
168
|
+
[Þ] 00C3 017E [Þ] 00DE
|
169
|
+
[Ã] 00C3 0192 [Ã] 00C3
|
170
|
+
[È] 00C3 02C6 [È] 00C8
|
171
|
+
[Ø] 00C3 02DC [Ø] 00D8
|
172
|
+
[Ö] 00C3 2013 [Ö] 00D6
|
173
|
+
[×] 00C3 2014 [×] 00D7
|
174
|
+
[Ñ] 00C3 2018 [Ñ] 00D1
|
175
|
+
[Ã’] 00C3 2019 [Ò] 00D2
|
176
|
+
[Â] 00C3 201A [Â] 00C2
|
177
|
+
[Ó] 00C3 201C [Ó] 00D3
|
178
|
+
[Ô] 00C3 201D [Ô] 00D4
|
179
|
+
[Ä] 00C3 201E [Ä] 00C4
|
180
|
+
[Æ] 00C3 2020 [Æ] 00C6
|
181
|
+
[Ç] 00C3 2021 [Ç] 00C7
|
182
|
+
[Õ] 00C3 2022 [Õ] 00D5
|
183
|
+
[Ã…] 00C3 2026 [Å] 00C5
|
184
|
+
[É] 00C3 2030 [É] 00C9
|
185
|
+
[Ë] 00C3 2039 [Ë] 00CB
|
186
|
+
[Û] 00C3 203A [Û] 00DB
|
187
|
+
[À] 00C3 20AC [À] 00C0
|
188
|
+
[Ù] 00C3 2122 [Ù] 00D9
|
189
|
+
[Ã�] 00C3 FFFD [Á] 00C1
|
190
|
+
[Å] 00C5 0092 [Œ] 0152
|
191
|
+
[Å] 00C5 0093 [œ] 0153
|
192
|
+
[Å ] 00C5 00A0 [Š] 0160
|
193
|
+
[Å¡] 00C5 00A1 [š] 0161
|
194
|
+
[Ÿ] 00C5 00B8 [Ÿ] 0178
|
195
|
+
[Ž] 00C5 00BD [Ž] 017D
|
196
|
+
[ž] 00C5 00BE [ž] 017E
|
197
|
+
[Å’] 00C5 2019 [Œ] 0152
|
198
|
+
[Å“] 00C5 201C [œ] 0153
|
199
|
+
[Æ] 00C6 0092 [ƒ] 0192
|
200
|
+
[Æ’] 00C6 2019 [ƒ] 0192
|
201
|
+
[Ë] 00CB 0086 [ˆ] 02C6
|
202
|
+
[Ë] 00CB 009C [˜] 02DC
|
203
|
+
[Ëœ] 00CB 0153 [˜] 02DC
|
204
|
+
[ˆ] 00CB 2020 [ˆ] 02C6
|
205
|
+
[â] 00E2 0080 0080 [ ] 2000
|
206
|
+
[â] 00E2 0080 0081 [ ] 2001
|
207
|
+
[â] 00E2 0080 0082 [ ] 2002
|
208
|
+
[â] 00E2 0080 0083 [ ] 2003
|
209
|
+
[â] 00E2 0080 0084 [ ] 2004
|
210
|
+
[â
] 00E2 0080 0085 [ ] 2005
|
211
|
+
[â] 00E2 0080 0086 [ ] 2006
|
212
|
+
[â] 00E2 0080 0087 [ ] 2007
|
213
|
+
[â] 00E2 0080 0088 [ ] 2008
|
214
|
+
[â] 00E2 0080 0089 [ ] 2009
|
215
|
+
[â] 00E2 0080 008A [ ] 200A
|
216
|
+
[â] 00E2 0080 008B [] 200B
|
217
|
+
[â] 00E2 0080 0093 [–] 2013
|
218
|
+
[â] 00E2 0080 0094 [—] 2014
|
219
|
+
[â] 00E2 0080 0098 [‘] 2018
|
220
|
+
[â] 00E2 0080 0099 [’] 2019
|
221
|
+
[â] 00E2 0080 009A [‚] 201A
|
222
|
+
[â] 00E2 0080 009C [“] 201C
|
223
|
+
[â] 00E2 0080 009D [”] 201D
|
224
|
+
[â] 00E2 0080 009E [„] 201E
|
225
|
+
[â ] 00E2 0080 00A0 [†] 2020
|
226
|
+
[â¡] 00E2 0080 00A1 [‡] 2021
|
227
|
+
[â¢] 00E2 0080 00A2 [•] 2022
|
228
|
+
[â¦] 00E2 0080 00A6 […] 2026
|
229
|
+
[â°] 00E2 0080 00B0 [‰] 2030
|
230
|
+
[â¹] 00E2 0080 00B9 [‹] 2039
|
231
|
+
[âº] 00E2 0080 00BA [›] 203A
|
232
|
+
[â ] 00E2 0081 00A0 [] 2060
|
233
|
+
[â¬] 00E2 0082 00AC [€] 20AC
|
234
|
+
[â¢] 00E2 0084 00A2 [™] 2122
|
235
|
+
[€] 00E2 201A 00AC [€] 20AC
|
236
|
+
[â„¢] 00E2 201E 00A2 [™] 2122
|
237
|
+
[â€] 00E2 20AC 0081 [ ] 2001
|
238
|
+
[â€] 00E2 20AC 009D [”] 201D
|
239
|
+
[†] 00E2 20AC 00A0 [†] 2020
|
240
|
+
[‡] 00E2 20AC 00A1 [‡] 2021
|
241
|
+
[•] 00E2 20AC 00A2 [•] 2022
|
242
|
+
[…] 00E2 20AC 00A6 […] 2026
|
243
|
+
[‰] 00E2 20AC 00B0 [‰] 2030
|
244
|
+
[‹] 00E2 20AC 00B9 [‹] 2039
|
245
|
+
[›] 00E2 20AC 00BA [›] 203A
|
246
|
+
[“] 00E2 20AC 0153 [“] 201C
|
247
|
+
[ ] 00E2 20AC 0160 [ ] 200A
|
248
|
+
[‚] 00E2 20AC 0161 [‚] 201A
|
249
|
+
[„] 00E2 20AC 017E [„] 201E
|
250
|
+
[ ] 00E2 20AC 0192 [ ] 2003
|
251
|
+
[ ] 00E2 20AC 02C6 [ ] 2008
|
252
|
+
[‘] 00E2 20AC 02DC [‘] 2018
|
253
|
+
[ ] 00E2 20AC 201A [ ] 2002
|
254
|
+
[–] 00E2 20AC 201C [–] 2013
|
255
|
+
[—] 00E2 20AC 201D [—] 2014
|
256
|
+
[ ] 00E2 20AC 201E [ ] 2004
|
257
|
+
[ ] 00E2 20AC 2020 [ ] 2006
|
258
|
+
[ ] 00E2 20AC 2021 [ ] 2007
|
259
|
+
[ ] 00E2 20AC 2026 [ ] 2005
|
260
|
+
[ ] 00E2 20AC 2030 [ ] 2009
|
261
|
+
[​] 00E2 20AC 2039 [] 200B
|
262
|
+
[ ] 00E2 20AC 20AC [ ] 2000
|
263
|
+
[’] 00E2 20AC 2122 [’] 2019
|
264
|
+
[â€�] 00E2 20AC FFFD [”] 201D
|
265
|
+
[â� ] 00E2 FFFD 00A0 [] 2060
|
266
|
+
[] 00EF 00BB 00BF [] FEFF
|
267
|
+
[�] 00EF 00BF 00BD [�] FFFD
|
268
|
+
[￾] 00EF 00BF 00BE [] FFFE
|
data/lib/iudex-core/base.rb
CHANGED
data/lib/iudex-core/iudex-core-1.1.0.jar
ADDED
Binary file
data/lib/iudex-core/mojibake.rb
ADDED
|
@@ -0,0 +1,73 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-core'
|
18
|
+
require 'java'
|
19
|
+
|
20
|
+
module Iudex::Core

  # Helpers for loading the mojibake recovery table used by
  # Filters::MojiBakeFilter.
  module MojiBake
    # Table bundled with the gem, resolved relative to this file.
    DEFAULT_CONFIG = File.join( File.dirname( __FILE__ ),
                                '..', '..', 'config', 'mojibake' )

    # Parse a mojibake config file.
    #
    # Two kinds of lines are recognized; every other line is ignored:
    # * a line of the form /PATTERN/, whose PATTERN text is kept as
    #   the regex source (the last such line wins);
    # * a table row "[..] HEX [HEX ...] [..] HEX", giving the code
    #   points of a mojibake sequence followed by the single code
    #   point that replaces it.
    #
    # Returns a two element Array: the regex source (a String, or nil
    # when the file has no pattern line) and a java.util.HashMap
    # mapping each mojibake java.lang.String to its replacement
    # java.lang.String.
    def self.load_config( file = DEFAULT_CONFIG )
      regex = nil
      mojis = []
      File.open( file ) do |fin|
        fin.each do |line|
          case line
          when %r{^/([^/]+)/$}
            regex = $1
          when /^\[.*?\]\s+([0-9A-F ]+)\s+\[.*\]\s+([0-9A-F]+)$/
            mojis << [ $1.split( ' ' ), $2 ]
          end
        end
      end

      mh = Java::java.util.HashMap.new( 512 )
      mojis.each do | moji, rpl |
        mh.put( jstring( moji ), jstring( rpl ) )
      end
      [ regex, mh ]
    end

    # Build a java.lang.String from hexadecimal code point text.
    #
    # cps may be an Array of hex strings or a single hex String (the
    # replacement column is passed as a bare String). Array() makes
    # both cases enumerable; String itself only responds to #map on
    # Ruby 1.8.
    def self.jstring( cps )
      cs = Array( cps ).map { |cp| cp.hex }.to_java( :char )
      Java::java.lang.String.new( cs )
    end

    # A bare `private` does not apply to `def self.` methods, so hide
    # the helper explicitly.
    private_class_method :jstring

  end

  module Filters
    import 'iudex.core.filters.MojiBakeFilter'

    # Re-open iudex.core.filters.MojiBakeFilter to add config file
    # based initialization.
    class MojiBakeFilter

      # Alt constructor taking a configuration file in `mojibake -t`
      # format. The regex source and replacement map returned by
      # MojiBake.load_config are splatted into the underlying
      # constructor after key.
      def initialize( key, config_file = MojiBake::DEFAULT_CONFIG )
        super( key, *MojiBake.load_config( config_file ) )
      end

    end

  end

end
|
data/lib/iudex-core.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
|
1
2
|
#--
|
2
3
|
# Copyright (c) 2008-2011 David Kellum
|
3
4
|
#
|
@@ -31,7 +32,9 @@ module Iudex
|
|
31
32
|
|
32
33
|
import 'iudex.core.ContentKeys'
|
33
34
|
import 'iudex.core.ContentSource'
|
34
|
-
import 'iudex.core.
|
35
|
+
import 'iudex.core.VisitManager'
|
36
|
+
import 'iudex.core.VisitQueueFactory'
|
37
|
+
import 'iudex.core.VisitQueue'
|
35
38
|
import 'iudex.core.VisitURL'
|
36
39
|
|
37
40
|
module Filters
|
@@ -40,10 +43,13 @@ module Iudex
|
|
40
43
|
import 'iudex.core.filters.DateChangeFilter'
|
41
44
|
import 'iudex.core.filters.DefaultFilter'
|
42
45
|
import 'iudex.core.filters.FutureDateFilter'
|
43
|
-
import 'iudex.core.filters.
|
46
|
+
import 'iudex.core.filters.RedirectHandler'
|
47
|
+
import 'iudex.core.filters.Revisitor'
|
44
48
|
import 'iudex.core.filters.TextCtrlWSFilter'
|
45
49
|
import 'iudex.core.filters.UHashMDCSetter'
|
46
50
|
end
|
47
51
|
|
48
52
|
end
|
49
53
|
end
|
54
|
+
|
55
|
+
require 'iudex-core/mojibake'
|
data/pom.xml
CHANGED
@@ -5,13 +5,13 @@
|
|
5
5
|
<groupId>iudex</groupId>
|
6
6
|
<artifactId>iudex-core</artifactId>
|
7
7
|
<packaging>jar</packaging>
|
8
|
-
<version>1.
|
8
|
+
<version>1.1.0</version>
|
9
9
|
<name>Iudex Core System</name>
|
10
10
|
|
11
11
|
<parent>
|
12
12
|
<groupId>iudex</groupId>
|
13
13
|
<artifactId>iudex-parent</artifactId>
|
14
|
-
<version>1.
|
14
|
+
<version>1.1</version>
|
15
15
|
<relativePath>..</relativePath>
|
16
16
|
</parent>
|
17
17
|
|
@@ -30,19 +30,19 @@
|
|
30
30
|
<dependency>
|
31
31
|
<groupId>iudex</groupId>
|
32
32
|
<artifactId>iudex-filter</artifactId>
|
33
|
-
<version>[1.
|
33
|
+
<version>[1.1,1.2)</version>
|
34
34
|
</dependency>
|
35
35
|
|
36
36
|
<dependency>
|
37
37
|
<groupId>iudex</groupId>
|
38
38
|
<artifactId>iudex-http</artifactId>
|
39
|
-
<version>[1.
|
39
|
+
<version>[1.1,1.2)</version>
|
40
40
|
</dependency>
|
41
41
|
|
42
42
|
<dependency>
|
43
43
|
<groupId>iudex</groupId>
|
44
44
|
<artifactId>iudex-barc</artifactId>
|
45
|
-
<version>[1.
|
45
|
+
<version>[1.1,1.2)</version>
|
46
46
|
</dependency>
|
47
47
|
|
48
48
|
<dependency>
|
data/test/test_content_fetcher.rb
CHANGED
@@ -37,10 +37,16 @@ module TestHTTPMocks
|
|
37
37
|
WEAK_ETAG = 'W/"weak-etag"'
|
38
38
|
|
39
39
|
class MockSession < Iudex::HTTP::HTTPSession
|
40
|
-
import 'com.gravitext.util.ByteBufferInputStream'
|
41
40
|
import 'java.nio.ByteBuffer'
|
42
41
|
include Iudex::HTTP
|
43
42
|
|
43
|
+
attr_writer :status
|
44
|
+
|
45
|
+
def initialize
|
46
|
+
super()
|
47
|
+
@status = 200
|
48
|
+
end
|
49
|
+
|
44
50
|
def requestHeaders
|
45
51
|
[ ]
|
46
52
|
end
|
@@ -49,12 +55,12 @@ module TestHTTPMocks
|
|
49
55
|
[ Header.new( "ETag", WEAK_ETAG ) ]
|
50
56
|
end
|
51
57
|
|
52
|
-
def
|
53
|
-
|
58
|
+
def statusCode
|
59
|
+
@status
|
54
60
|
end
|
55
61
|
|
56
|
-
def
|
57
|
-
|
62
|
+
def responseBody
|
63
|
+
ByteBuffer::wrap( "BODY".to_java_bytes )
|
58
64
|
end
|
59
65
|
|
60
66
|
def statusText
|
@@ -62,7 +68,10 @@ module TestHTTPMocks
|
|
62
68
|
end
|
63
69
|
|
64
70
|
def execute( handler )
|
65
|
-
handler.
|
71
|
+
handler.session_completed( self )
|
72
|
+
end
|
73
|
+
|
74
|
+
def close
|
66
75
|
end
|
67
76
|
end
|
68
77
|
|
@@ -88,6 +97,20 @@ module TestHTTPMocks
|
|
88
97
|
end
|
89
98
|
end
|
90
99
|
|
100
|
+
import 'iudex.core.VisitCounter'
|
101
|
+
|
102
|
+
# Stub VisitCounter for the fetcher tests: it ignores added orders and
# remembers the url of the most recently released one.
class TestVisitCounter
  include VisitCounter

  # Url of the last order passed to #release, or nil if none yet.
  attr_reader :released

  # Intentionally does nothing.
  def add( order ); end

  # Record the released order's url; newOrder is not used.
  def release( acquired, newOrder )
    @released = acquired.url
  end
end
|
113
|
+
|
91
114
|
end
|
92
115
|
|
93
116
|
class TestContentFetcher < MiniTest::Unit::TestCase
|
@@ -119,54 +142,27 @@ class TestContentFetcher < MiniTest::Unit::TestCase
|
|
119
142
|
def test_304
|
120
143
|
client = MockHTTPClient.new
|
121
144
|
def client.request( session, handler )
|
122
|
-
|
145
|
+
session.status = 304
|
146
|
+
handler.session_completed( session )
|
123
147
|
end
|
124
148
|
fetch( create_content, client ) do |out|
|
125
149
|
assert_equal( DEFAULT_URL, out.url.to_s )
|
126
150
|
assert_equal( 304, out.status )
|
127
|
-
assert_nil( out.etag )
|
128
|
-
assert_nil( out.source )
|
129
|
-
end
|
130
|
-
end
|
131
|
-
|
132
|
-
REDIRECT_URL = "http://gravitext.com/redirect#foo"
|
133
|
-
REDIRECT_NORM = "http://gravitext.com/redirect"
|
134
|
-
|
135
|
-
def test_redirect
|
136
|
-
client = MockHTTPClient.new
|
137
|
-
def client.create_session
|
138
|
-
s = MockSession.new
|
139
|
-
def s.execute( handler )
|
140
|
-
self.url = REDIRECT_URL
|
141
|
-
super
|
142
|
-
end
|
143
|
-
s
|
144
|
-
end
|
145
|
-
fetch( create_content, client ) do |out|
|
146
|
-
assert_equal( REDIRECT_NORM, out.url.to_s )
|
147
|
-
assert_equal( 200, out.status )
|
148
|
-
|
149
|
-
ref = out.referer
|
150
|
-
|
151
|
-
assert_equal( DEFAULT_URL, ref.url.to_s )
|
152
|
-
assert_equal( 302, ref.status )
|
153
|
-
assert_equal( REDIRECT_NORM, ref.referent.url.to_s )
|
154
151
|
end
|
155
152
|
end
|
156
153
|
|
157
154
|
import "java.net.UnknownHostException"
|
158
|
-
import "java.io.IOException"
|
159
155
|
|
160
156
|
def test_connect_error
|
161
157
|
client = MockHTTPClient.new
|
162
158
|
def client.create_session
|
163
159
|
s = MockSession.new
|
164
160
|
def s.execute( handler )
|
165
|
-
|
166
|
-
|
161
|
+
self.error = UnknownHostException.new( "foobar.com" )
|
162
|
+
handler.session_completed( self )
|
167
163
|
end
|
168
|
-
def s.
|
169
|
-
|
164
|
+
def s.statusCode
|
165
|
+
-1
|
170
166
|
end
|
171
167
|
def s.responseHeaders
|
172
168
|
nil
|
@@ -182,7 +178,9 @@ class TestContentFetcher < MiniTest::Unit::TestCase
|
|
182
178
|
|
183
179
|
def fetch( content, client = MockHTTPClient.new, &block )
|
184
180
|
rec = TestReceiver.new( &block )
|
181
|
+
counter = TestVisitCounter.new
|
185
182
|
cf = ContentFetcher.new( client,
|
183
|
+
counter,
|
186
184
|
FilterChain.new( "test-rec", [ rec ] ) )
|
187
185
|
cf.filter( content )
|
188
186
|
end
|
data/test/test_content_source.rb
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
require 'iudex-core'
|
22
|
+
|
23
|
+
# Tests for the default encoding / confidence bookkeeping of
# Iudex::Core::ContentSource.
class TestContentSource < MiniTest::Unit::TestCase
  include Iudex::Core

  import 'java.nio.ByteBuffer'
  import 'java.nio.charset.Charset'

  # Resolve a java.nio.charset.Charset by name.
  def self.charset( name )
    Charset::lookup( name )
  end

  UTF8 = charset( "UTF-8" )
  ISO1 = charset( "ISO-8859-1" )

  # Each test starts from a fresh source wrapping a small byte buffer.
  def setup
    @cs = ContentSource.new( ByteBuffer::wrap( "any".to_java_bytes ) )
  end

  # A new source should report no default encoding.
  #
  # This was originally also named test_default_encoding; the second
  # definition below replaced it, so it never ran. It has a distinct
  # name so that both tests execute.
  def test_default_encoding_unset
    refute( @cs.default_encoding )
  end

  # Setting an encoding is accepted even at zero confidence.
  def test_default_encoding
    assert( @cs.set_default_encoding( UTF8, 0.0 ) )
    assert_equal( UTF8, @cs.default_encoding )
    assert_in_epsilon( 0.0, @cs.encoding_confidence )
  end

  # Setting the same encoding twice is expected to sum the confidences.
  def test_default_encoding_additive
    2.times { assert( @cs.set_default_encoding( UTF8, 0.10 ) ) }
    assert_equal( UTF8, @cs.default_encoding )
    assert_in_epsilon( 0.20, @cs.encoding_confidence )
  end

  # Map based updates: the expected values below imply confidence
  # accumulates per charset across calls (UTF8: 0.10 + 0.05 + 0.07),
  # including calls that return false because the default is unchanged.
  def test_default_encoding_map
    assert( @cs.set_default_encoding( { UTF8 => f( 0.10 ),
                                        ISO1 => f( 0.20 ) } ) )
    assert_equal( ISO1, @cs.default_encoding )
    assert_in_epsilon( 0.20, @cs.encoding_confidence )

    refute( @cs.set_default_encoding( {} ) )
    refute( @cs.set_default_encoding( { UTF8 => f( 0.05 ) } ) )
    assert( @cs.set_default_encoding( { UTF8 => f( 0.07 ),
                                        ISO1 => f( 0.01 ) } ) )

    assert_equal( UTF8, @cs.default_encoding )
    assert_in_epsilon( 0.22, @cs.encoding_confidence )
  end

  # Box a Ruby number as java.lang.Float for use as a map value.
  def f( v )
    Java::java.lang.Float.new( v )
  end

end
|
data/test/test_mojibake.rb
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
#.hashdot.profile += jruby-shortlived
|
4
|
+
|
5
|
+
#--
|
6
|
+
# Copyright (c) 2011 David Kellum
|
7
|
+
#
|
8
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
|
+
# may not use this file except in compliance with the License. You
|
10
|
+
# may obtain a copy of the License at
|
11
|
+
#
|
12
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
13
|
+
#
|
14
|
+
# Unless required by applicable law or agreed to in writing, software
|
15
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
16
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
17
|
+
# implied. See the License for the specific language governing
|
18
|
+
# permissions and limitations under the License.
|
19
|
+
#++
|
20
|
+
|
21
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
22
|
+
require 'iudex-core/mojibake'
|
23
|
+
|
24
|
+
class TestMojiBake < MiniTest::Unit::TestCase
|
25
|
+
include Gravitext::HTMap
|
26
|
+
include Iudex::Core
|
27
|
+
include Iudex::Core::Filters
|
28
|
+
|
29
|
+
UniMap.define_accessors
|
30
|
+
|
31
|
+
FILTER = MojiBakeFilter.new( ContentKeys::SUMMARY )
|
32
|
+
|
33
|
+
def test_nomatch_recover
|
34
|
+
assert_filter( '', '' )
|
35
|
+
assert_filter( 'ascii', 'ascii' )
|
36
|
+
assert_filter( 'Â', 'Â' )
|
37
|
+
end
|
38
|
+
|
39
|
+
def test_simple_recover
|
40
|
+
assert_filter( '[°]', '[°]' )
|
41
|
+
assert_filter( '“quoted”', '“quotedâ€�' )
|
42
|
+
assert_filter( '“quoted”', 'âquotedâ€' )
|
43
|
+
end
|
44
|
+
|
45
|
+
def test_recursive_recover
|
46
|
+
assert_filter( '°', '°' )
|
47
|
+
assert_filter( 'AP – Greenlake', 'AP – Greenlake' )
|
48
|
+
assert_filter( 'you’re', 'you’re' )
|
49
|
+
end
|
50
|
+
|
51
|
+
# Run FILTER over a fresh UniMap whose summary is input, then check
# that the filter accepted the map and that the summary now reads
# output. The failure message names the original input.
def assert_filter( output, input )
  content = UniMap.new
  content.summary = input
  accepted = FILTER.filter( content )
  assert( accepted )
  assert_equal( output, content.summary.to_s, "From: #{input}" )
end
|
57
|
+
|
58
|
+
end
|