iudex-core 1.0.0-java → 1.1.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/History.rdoc +21 -0
- data/Manifest.txt +9 -1
- data/Rakefile +6 -6
- data/bin/iudex-test-config +1 -1
- data/bin/iudex-url-norm +4 -4
- data/build/effective_tld_name.dat +432 -29
- data/config/mojibake +268 -0
- data/lib/iudex-core/base.rb +1 -1
- data/lib/iudex-core/iudex-core-1.1.0.jar +0 -0
- data/lib/iudex-core/mojibake.rb +73 -0
- data/lib/iudex-core.rb +8 -2
- data/pom.xml +5 -5
- data/test/test_content_fetcher.rb +37 -39
- data/test/test_content_source.rb +75 -0
- data/test/test_mojibake.rb +58 -0
- data/test/test_redirect_handler.rb +170 -0
- data/test/test_visit_manager.rb +107 -0
- data/test/test_visit_queue.rb +268 -0
- data/test/test_visit_url.rb +150 -0
- metadata +26 -16
- data/lib/iudex-core/iudex-core-1.0.0.jar +0 -0
data/config/mojibake
ADDED
@@ -0,0 +1,268 @@
|
|
1
|
+
# -*- coding: utf-8 -*- mojibake: 1.0.0
|
2
|
+
/Â[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Ã[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Å[\u0092\u0093\u00A0¡¸½¾’“]|Æ[\u0092’]|Ë[\u0086\u009Cœ†]|â(\u0080[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u0093\u0094\u0098\u0099\u009A\u009C\u009D\u009E\u00A0¡¢¦°¹º]|\u0081\u00A0|\u0082¬|\u0084¢|‚¬|„¢|€[\u0081\u009D\u00A0¡¢¦°¹ºœŠšžƒˆ˜‚“”„†‡…‰‹€™\uFFFD]|\uFFFD\u00A0)|ï(»¿|¿[½¾])/
|
3
|
+
|
4
|
+
Moji UNICODE Org CODE
|
5
|
+
+---- ---- ---- ---- ----- ---+
|
6
|
+
[Â] 00C2 0080 [] 0080
|
7
|
+
[Â] 00C2 0081 [] 0081
|
8
|
+
[Â] 00C2 0082 [] 0082
|
9
|
+
[Â] 00C2 0083 [] 0083
|
10
|
+
[Â] 00C2 0084 [] 0084
|
11
|
+
[Â
] 00C2 0085 [
] 0085
|
12
|
+
[Â] 00C2 0086 [] 0086
|
13
|
+
[Â] 00C2 0087 [] 0087
|
14
|
+
[Â] 00C2 0088 [] 0088
|
15
|
+
[Â] 00C2 0089 [] 0089
|
16
|
+
[Â] 00C2 008A [] 008A
|
17
|
+
[Â] 00C2 008B [] 008B
|
18
|
+
[Â] 00C2 008C [] 008C
|
19
|
+
[Â] 00C2 008D [] 008D
|
20
|
+
[Â] 00C2 008E [] 008E
|
21
|
+
[Â] 00C2 008F [] 008F
|
22
|
+
[Â] 00C2 0090 [] 0090
|
23
|
+
[Â] 00C2 0091 [] 0091
|
24
|
+
[Â] 00C2 0092 [] 0092
|
25
|
+
[Â] 00C2 0093 [] 0093
|
26
|
+
[Â] 00C2 0094 [] 0094
|
27
|
+
[Â] 00C2 0095 [] 0095
|
28
|
+
[Â] 00C2 0096 [] 0096
|
29
|
+
[Â] 00C2 0097 [] 0097
|
30
|
+
[Â] 00C2 0098 [] 0098
|
31
|
+
[Â] 00C2 0099 [] 0099
|
32
|
+
[Â] 00C2 009A [] 009A
|
33
|
+
[Â] 00C2 009B [] 009B
|
34
|
+
[Â] 00C2 009C [] 009C
|
35
|
+
[Â] 00C2 009D [] 009D
|
36
|
+
[Â] 00C2 009E [] 009E
|
37
|
+
[Â] 00C2 009F [] 009F
|
38
|
+
[Â ] 00C2 00A0 [ ] 00A0
|
39
|
+
[¡] 00C2 00A1 [¡] 00A1
|
40
|
+
[¢] 00C2 00A2 [¢] 00A2
|
41
|
+
[£] 00C2 00A3 [£] 00A3
|
42
|
+
[¤] 00C2 00A4 [¤] 00A4
|
43
|
+
[Â¥] 00C2 00A5 [¥] 00A5
|
44
|
+
[¦] 00C2 00A6 [¦] 00A6
|
45
|
+
[§] 00C2 00A7 [§] 00A7
|
46
|
+
[¨] 00C2 00A8 [¨] 00A8
|
47
|
+
[©] 00C2 00A9 [©] 00A9
|
48
|
+
[ª] 00C2 00AA [ª] 00AA
|
49
|
+
[«] 00C2 00AB [«] 00AB
|
50
|
+
[¬] 00C2 00AC [¬] 00AC
|
51
|
+
[Â] 00C2 00AD [] 00AD
|
52
|
+
[®] 00C2 00AE [®] 00AE
|
53
|
+
[¯] 00C2 00AF [¯] 00AF
|
54
|
+
[°] 00C2 00B0 [°] 00B0
|
55
|
+
[±] 00C2 00B1 [±] 00B1
|
56
|
+
[²] 00C2 00B2 [²] 00B2
|
57
|
+
[³] 00C2 00B3 [³] 00B3
|
58
|
+
[´] 00C2 00B4 [´] 00B4
|
59
|
+
[µ] 00C2 00B5 [µ] 00B5
|
60
|
+
[¶] 00C2 00B6 [¶] 00B6
|
61
|
+
[·] 00C2 00B7 [·] 00B7
|
62
|
+
[¸] 00C2 00B8 [¸] 00B8
|
63
|
+
[¹] 00C2 00B9 [¹] 00B9
|
64
|
+
[º] 00C2 00BA [º] 00BA
|
65
|
+
[»] 00C2 00BB [»] 00BB
|
66
|
+
[¼] 00C2 00BC [¼] 00BC
|
67
|
+
[½] 00C2 00BD [½] 00BD
|
68
|
+
[¾] 00C2 00BE [¾] 00BE
|
69
|
+
[¿] 00C2 00BF [¿] 00BF
|
70
|
+
[Œ] 00C2 0152 [] 008C
|
71
|
+
[œ] 00C2 0153 [] 009C
|
72
|
+
[Š] 00C2 0160 [] 008A
|
73
|
+
[š] 00C2 0161 [] 009A
|
74
|
+
[Ÿ] 00C2 0178 [] 009F
|
75
|
+
[ÂŽ] 00C2 017D [] 008E
|
76
|
+
[ž] 00C2 017E [] 009E
|
77
|
+
[ƒ] 00C2 0192 [] 0083
|
78
|
+
[ˆ] 00C2 02C6 [] 0088
|
79
|
+
[˜] 00C2 02DC [] 0098
|
80
|
+
[–] 00C2 2013 [] 0096
|
81
|
+
[—] 00C2 2014 [] 0097
|
82
|
+
[‘] 00C2 2018 [] 0091
|
83
|
+
[Â’] 00C2 2019 [] 0092
|
84
|
+
[‚] 00C2 201A [] 0082
|
85
|
+
[“] 00C2 201C [] 0093
|
86
|
+
[”] 00C2 201D [] 0094
|
87
|
+
[„] 00C2 201E [] 0084
|
88
|
+
[†] 00C2 2020 [] 0086
|
89
|
+
[‡] 00C2 2021 [] 0087
|
90
|
+
[•] 00C2 2022 [] 0095
|
91
|
+
[Â…] 00C2 2026 [
] 0085
|
92
|
+
[‰] 00C2 2030 [] 0089
|
93
|
+
[‹] 00C2 2039 [] 008B
|
94
|
+
[›] 00C2 203A [] 009B
|
95
|
+
[€] 00C2 20AC [] 0080
|
96
|
+
[™] 00C2 2122 [] 0099
|
97
|
+
[�] 00C2 FFFD [] 0081
|
98
|
+
[Ã] 00C3 0080 [À] 00C0
|
99
|
+
[Ã] 00C3 0081 [Á] 00C1
|
100
|
+
[Ã] 00C3 0082 [Â] 00C2
|
101
|
+
[Ã] 00C3 0083 [Ã] 00C3
|
102
|
+
[Ã] 00C3 0084 [Ä] 00C4
|
103
|
+
[Ã
] 00C3 0085 [Å] 00C5
|
104
|
+
[Ã] 00C3 0086 [Æ] 00C6
|
105
|
+
[Ã] 00C3 0087 [Ç] 00C7
|
106
|
+
[Ã] 00C3 0088 [È] 00C8
|
107
|
+
[Ã] 00C3 0089 [É] 00C9
|
108
|
+
[Ã] 00C3 008A [Ê] 00CA
|
109
|
+
[Ã] 00C3 008B [Ë] 00CB
|
110
|
+
[Ã] 00C3 008C [Ì] 00CC
|
111
|
+
[Ã] 00C3 008D [Í] 00CD
|
112
|
+
[Ã] 00C3 008E [Î] 00CE
|
113
|
+
[Ã] 00C3 008F [Ï] 00CF
|
114
|
+
[Ã] 00C3 0090 [Ð] 00D0
|
115
|
+
[Ã] 00C3 0091 [Ñ] 00D1
|
116
|
+
[Ã] 00C3 0092 [Ò] 00D2
|
117
|
+
[Ã] 00C3 0093 [Ó] 00D3
|
118
|
+
[Ã] 00C3 0094 [Ô] 00D4
|
119
|
+
[Ã] 00C3 0095 [Õ] 00D5
|
120
|
+
[Ã] 00C3 0096 [Ö] 00D6
|
121
|
+
[Ã] 00C3 0097 [×] 00D7
|
122
|
+
[Ã] 00C3 0098 [Ø] 00D8
|
123
|
+
[Ã] 00C3 0099 [Ù] 00D9
|
124
|
+
[Ã] 00C3 009A [Ú] 00DA
|
125
|
+
[Ã] 00C3 009B [Û] 00DB
|
126
|
+
[Ã] 00C3 009C [Ü] 00DC
|
127
|
+
[Ã] 00C3 009D [Ý] 00DD
|
128
|
+
[Ã] 00C3 009E [Þ] 00DE
|
129
|
+
[Ã] 00C3 009F [ß] 00DF
|
130
|
+
[Ã ] 00C3 00A0 [à] 00E0
|
131
|
+
[á] 00C3 00A1 [á] 00E1
|
132
|
+
[â] 00C3 00A2 [â] 00E2
|
133
|
+
[ã] 00C3 00A3 [ã] 00E3
|
134
|
+
[ä] 00C3 00A4 [ä] 00E4
|
135
|
+
[Ã¥] 00C3 00A5 [å] 00E5
|
136
|
+
[æ] 00C3 00A6 [æ] 00E6
|
137
|
+
[ç] 00C3 00A7 [ç] 00E7
|
138
|
+
[è] 00C3 00A8 [è] 00E8
|
139
|
+
[é] 00C3 00A9 [é] 00E9
|
140
|
+
[ê] 00C3 00AA [ê] 00EA
|
141
|
+
[ë] 00C3 00AB [ë] 00EB
|
142
|
+
[ì] 00C3 00AC [ì] 00EC
|
143
|
+
[Ã] 00C3 00AD [í] 00ED
|
144
|
+
[î] 00C3 00AE [î] 00EE
|
145
|
+
[ï] 00C3 00AF [ï] 00EF
|
146
|
+
[ð] 00C3 00B0 [ð] 00F0
|
147
|
+
[ñ] 00C3 00B1 [ñ] 00F1
|
148
|
+
[ò] 00C3 00B2 [ò] 00F2
|
149
|
+
[ó] 00C3 00B3 [ó] 00F3
|
150
|
+
[ô] 00C3 00B4 [ô] 00F4
|
151
|
+
[õ] 00C3 00B5 [õ] 00F5
|
152
|
+
[ö] 00C3 00B6 [ö] 00F6
|
153
|
+
[÷] 00C3 00B7 [÷] 00F7
|
154
|
+
[ø] 00C3 00B8 [ø] 00F8
|
155
|
+
[ù] 00C3 00B9 [ù] 00F9
|
156
|
+
[ú] 00C3 00BA [ú] 00FA
|
157
|
+
[û] 00C3 00BB [û] 00FB
|
158
|
+
[ü] 00C3 00BC [ü] 00FC
|
159
|
+
[ý] 00C3 00BD [ý] 00FD
|
160
|
+
[þ] 00C3 00BE [þ] 00FE
|
161
|
+
[ÿ] 00C3 00BF [ÿ] 00FF
|
162
|
+
[ÃŒ] 00C3 0152 [Ì] 00CC
|
163
|
+
[Ãœ] 00C3 0153 [Ü] 00DC
|
164
|
+
[Ê] 00C3 0160 [Ê] 00CA
|
165
|
+
[Ú] 00C3 0161 [Ú] 00DA
|
166
|
+
[ß] 00C3 0178 [ß] 00DF
|
167
|
+
[ÃŽ] 00C3 017D [Î] 00CE
|
168
|
+
[Þ] 00C3 017E [Þ] 00DE
|
169
|
+
[Ã] 00C3 0192 [Ã] 00C3
|
170
|
+
[È] 00C3 02C6 [È] 00C8
|
171
|
+
[Ø] 00C3 02DC [Ø] 00D8
|
172
|
+
[Ö] 00C3 2013 [Ö] 00D6
|
173
|
+
[×] 00C3 2014 [×] 00D7
|
174
|
+
[Ñ] 00C3 2018 [Ñ] 00D1
|
175
|
+
[Ã’] 00C3 2019 [Ò] 00D2
|
176
|
+
[Â] 00C3 201A [Â] 00C2
|
177
|
+
[Ó] 00C3 201C [Ó] 00D3
|
178
|
+
[Ô] 00C3 201D [Ô] 00D4
|
179
|
+
[Ä] 00C3 201E [Ä] 00C4
|
180
|
+
[Æ] 00C3 2020 [Æ] 00C6
|
181
|
+
[Ç] 00C3 2021 [Ç] 00C7
|
182
|
+
[Õ] 00C3 2022 [Õ] 00D5
|
183
|
+
[Ã…] 00C3 2026 [Å] 00C5
|
184
|
+
[É] 00C3 2030 [É] 00C9
|
185
|
+
[Ë] 00C3 2039 [Ë] 00CB
|
186
|
+
[Û] 00C3 203A [Û] 00DB
|
187
|
+
[À] 00C3 20AC [À] 00C0
|
188
|
+
[Ù] 00C3 2122 [Ù] 00D9
|
189
|
+
[Ã�] 00C3 FFFD [Á] 00C1
|
190
|
+
[Å] 00C5 0092 [Œ] 0152
|
191
|
+
[Å] 00C5 0093 [œ] 0153
|
192
|
+
[Å ] 00C5 00A0 [Š] 0160
|
193
|
+
[Å¡] 00C5 00A1 [š] 0161
|
194
|
+
[Ÿ] 00C5 00B8 [Ÿ] 0178
|
195
|
+
[Ž] 00C5 00BD [Ž] 017D
|
196
|
+
[ž] 00C5 00BE [ž] 017E
|
197
|
+
[Å’] 00C5 2019 [Œ] 0152
|
198
|
+
[Å“] 00C5 201C [œ] 0153
|
199
|
+
[Æ] 00C6 0092 [ƒ] 0192
|
200
|
+
[Æ’] 00C6 2019 [ƒ] 0192
|
201
|
+
[Ë] 00CB 0086 [ˆ] 02C6
|
202
|
+
[Ë] 00CB 009C [˜] 02DC
|
203
|
+
[Ëœ] 00CB 0153 [˜] 02DC
|
204
|
+
[ˆ] 00CB 2020 [ˆ] 02C6
|
205
|
+
[â] 00E2 0080 0080 [ ] 2000
|
206
|
+
[â] 00E2 0080 0081 [ ] 2001
|
207
|
+
[â] 00E2 0080 0082 [ ] 2002
|
208
|
+
[â] 00E2 0080 0083 [ ] 2003
|
209
|
+
[â] 00E2 0080 0084 [ ] 2004
|
210
|
+
[â
] 00E2 0080 0085 [ ] 2005
|
211
|
+
[â] 00E2 0080 0086 [ ] 2006
|
212
|
+
[â] 00E2 0080 0087 [ ] 2007
|
213
|
+
[â] 00E2 0080 0088 [ ] 2008
|
214
|
+
[â] 00E2 0080 0089 [ ] 2009
|
215
|
+
[â] 00E2 0080 008A [ ] 200A
|
216
|
+
[â] 00E2 0080 008B [] 200B
|
217
|
+
[â] 00E2 0080 0093 [–] 2013
|
218
|
+
[â] 00E2 0080 0094 [—] 2014
|
219
|
+
[â] 00E2 0080 0098 [‘] 2018
|
220
|
+
[â] 00E2 0080 0099 [’] 2019
|
221
|
+
[â] 00E2 0080 009A [‚] 201A
|
222
|
+
[â] 00E2 0080 009C [“] 201C
|
223
|
+
[â] 00E2 0080 009D [”] 201D
|
224
|
+
[â] 00E2 0080 009E [„] 201E
|
225
|
+
[â ] 00E2 0080 00A0 [†] 2020
|
226
|
+
[â¡] 00E2 0080 00A1 [‡] 2021
|
227
|
+
[â¢] 00E2 0080 00A2 [•] 2022
|
228
|
+
[â¦] 00E2 0080 00A6 […] 2026
|
229
|
+
[â°] 00E2 0080 00B0 [‰] 2030
|
230
|
+
[â¹] 00E2 0080 00B9 [‹] 2039
|
231
|
+
[âº] 00E2 0080 00BA [›] 203A
|
232
|
+
[â ] 00E2 0081 00A0 [] 2060
|
233
|
+
[â¬] 00E2 0082 00AC [€] 20AC
|
234
|
+
[â¢] 00E2 0084 00A2 [™] 2122
|
235
|
+
[€] 00E2 201A 00AC [€] 20AC
|
236
|
+
[â„¢] 00E2 201E 00A2 [™] 2122
|
237
|
+
[â€] 00E2 20AC 0081 [ ] 2001
|
238
|
+
[â€] 00E2 20AC 009D [”] 201D
|
239
|
+
[†] 00E2 20AC 00A0 [†] 2020
|
240
|
+
[‡] 00E2 20AC 00A1 [‡] 2021
|
241
|
+
[•] 00E2 20AC 00A2 [•] 2022
|
242
|
+
[…] 00E2 20AC 00A6 […] 2026
|
243
|
+
[‰] 00E2 20AC 00B0 [‰] 2030
|
244
|
+
[‹] 00E2 20AC 00B9 [‹] 2039
|
245
|
+
[›] 00E2 20AC 00BA [›] 203A
|
246
|
+
[“] 00E2 20AC 0153 [“] 201C
|
247
|
+
[ ] 00E2 20AC 0160 [ ] 200A
|
248
|
+
[‚] 00E2 20AC 0161 [‚] 201A
|
249
|
+
[„] 00E2 20AC 017E [„] 201E
|
250
|
+
[ ] 00E2 20AC 0192 [ ] 2003
|
251
|
+
[ ] 00E2 20AC 02C6 [ ] 2008
|
252
|
+
[‘] 00E2 20AC 02DC [‘] 2018
|
253
|
+
[ ] 00E2 20AC 201A [ ] 2002
|
254
|
+
[–] 00E2 20AC 201C [–] 2013
|
255
|
+
[—] 00E2 20AC 201D [—] 2014
|
256
|
+
[ ] 00E2 20AC 201E [ ] 2004
|
257
|
+
[ ] 00E2 20AC 2020 [ ] 2006
|
258
|
+
[ ] 00E2 20AC 2021 [ ] 2007
|
259
|
+
[ ] 00E2 20AC 2026 [ ] 2005
|
260
|
+
[ ] 00E2 20AC 2030 [ ] 2009
|
261
|
+
[​] 00E2 20AC 2039 [] 200B
|
262
|
+
[ ] 00E2 20AC 20AC [ ] 2000
|
263
|
+
[’] 00E2 20AC 2122 [’] 2019
|
264
|
+
[â€�] 00E2 20AC FFFD [”] 201D
|
265
|
+
[â� ] 00E2 FFFD 00A0 [] 2060
|
266
|
+
[] 00EF 00BB 00BF [] FEFF
|
267
|
+
[�] 00EF 00BF 00BD [�] FFFD
|
268
|
+
[￾] 00EF 00BF 00BE [] FFFE
|
data/lib/iudex-core/base.rb
CHANGED
Binary file
|
@@ -0,0 +1,73 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-core'
|
18
|
+
require 'java'
|
19
|
+
|
20
|
+
module Iudex::Core
|
21
|
+
|
22
|
+
# Loader for mojibake configuration files: a text file holding one
# regular expression line (delimited by '/') followed by a table of
# lines of the form
#
#   [moji] CODE CODE [CODE]   [original] CODE
#
# where each CODE is an upper-case hexadecimal character code.
module MojiBake
  DEFAULT_CONFIG = File.join( File.dirname( __FILE__ ),
                              '..', '..', 'config', 'mojibake' )

  # Read the given configuration file and return a two element array:
  # the regex source as a String (nil if the file has no '/.../'
  # line) and a java.util.HashMap mapping each mojibake sequence
  # (java.lang.String) to its replacement (java.lang.String).
  def self.load_config( file = DEFAULT_CONFIG )
    regex, mojis = File.open( file ) { |fin| parse_config( fin ) }

    mh = Java::java.util.HashMap.new( 512 )
    mojis.each do | moji, rpl |
      mh.put( jstring( moji ), jstring( rpl ) )
    end
    [ regex, mh ]
  end

  # Scan the lines yielded by lines.each and return [ regex, mojis ]:
  # regex is the text between the slashes of the last '/.../' line
  # seen (or nil), and mojis is an array of
  # [ [ code, ... ], replacement_code ] pairs of hex strings, one per
  # table line. Lines matching neither form are ignored.
  def self.parse_config( lines )
    regex = nil
    mojis = []
    lines.each do |line|
      case line
      when %r{^/([^/]+)/$}
        regex = $1
      when /^\[.*?\]\s+([0-9A-F ]+)\s+\[.*\]\s+([0-9A-F]+)$/
        mojis << [ $1.split( ' ' ), $2 ]
      end
    end
    [ regex, mojis ]
  end

  # Convert one hex code (String) or several (Array of String) to a
  # java.lang.String with one char per code. Array() is used so that a
  # single String works on Ruby 1.9+, where String is no longer
  # Enumerable and has no #map.
  def self.jstring( cps )
    cs = Array( cps ).map { |cp| cp.hex }.to_java( :char )
    Java::java.lang.String.new( cs )
  end

  # A bare `private` does not apply to `def self.` methods; hide the
  # helpers explicitly.
  private_class_method :parse_config, :jstring

end
|
55
|
+
|
56
|
+
module Filters
  import 'iudex.core.filters.MojiBakeFilter'

  # Ruby-side extension of the imported iudex.core.filters.MojiBakeFilter
  # adding construction from a configuration file.
  class MojiBakeFilter

    # Construct for the given key, passing the regex and replacement
    # map that MojiBake.load_config reads from config_file (a file in
    # `mojibake -t` format) on to the original constructor.
    def initialize( key, config_file = MojiBake::DEFAULT_CONFIG )
      pattern, replacements = MojiBake.load_config( config_file )
      super( key, pattern, replacements )
    end

  end

end
|
72
|
+
|
73
|
+
end
|
data/lib/iudex-core.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
|
1
2
|
#--
|
2
3
|
# Copyright (c) 2008-2011 David Kellum
|
3
4
|
#
|
@@ -31,7 +32,9 @@ module Iudex
|
|
31
32
|
|
32
33
|
import 'iudex.core.ContentKeys'
|
33
34
|
import 'iudex.core.ContentSource'
|
34
|
-
import 'iudex.core.
|
35
|
+
import 'iudex.core.VisitManager'
|
36
|
+
import 'iudex.core.VisitQueueFactory'
|
37
|
+
import 'iudex.core.VisitQueue'
|
35
38
|
import 'iudex.core.VisitURL'
|
36
39
|
|
37
40
|
module Filters
|
@@ -40,10 +43,13 @@ module Iudex
|
|
40
43
|
import 'iudex.core.filters.DateChangeFilter'
|
41
44
|
import 'iudex.core.filters.DefaultFilter'
|
42
45
|
import 'iudex.core.filters.FutureDateFilter'
|
43
|
-
import 'iudex.core.filters.
|
46
|
+
import 'iudex.core.filters.RedirectHandler'
|
47
|
+
import 'iudex.core.filters.Revisitor'
|
44
48
|
import 'iudex.core.filters.TextCtrlWSFilter'
|
45
49
|
import 'iudex.core.filters.UHashMDCSetter'
|
46
50
|
end
|
47
51
|
|
48
52
|
end
|
49
53
|
end
|
54
|
+
|
55
|
+
require 'iudex-core/mojibake'
|
data/pom.xml
CHANGED
@@ -5,13 +5,13 @@
|
|
5
5
|
<groupId>iudex</groupId>
|
6
6
|
<artifactId>iudex-core</artifactId>
|
7
7
|
<packaging>jar</packaging>
|
8
|
-
<version>1.
|
8
|
+
<version>1.1.0</version>
|
9
9
|
<name>Iudex Core System</name>
|
10
10
|
|
11
11
|
<parent>
|
12
12
|
<groupId>iudex</groupId>
|
13
13
|
<artifactId>iudex-parent</artifactId>
|
14
|
-
<version>1.
|
14
|
+
<version>1.1</version>
|
15
15
|
<relativePath>..</relativePath>
|
16
16
|
</parent>
|
17
17
|
|
@@ -30,19 +30,19 @@
|
|
30
30
|
<dependency>
|
31
31
|
<groupId>iudex</groupId>
|
32
32
|
<artifactId>iudex-filter</artifactId>
|
33
|
-
<version>[1.
|
33
|
+
<version>[1.1,1.2)</version>
|
34
34
|
</dependency>
|
35
35
|
|
36
36
|
<dependency>
|
37
37
|
<groupId>iudex</groupId>
|
38
38
|
<artifactId>iudex-http</artifactId>
|
39
|
-
<version>[1.
|
39
|
+
<version>[1.1,1.2)</version>
|
40
40
|
</dependency>
|
41
41
|
|
42
42
|
<dependency>
|
43
43
|
<groupId>iudex</groupId>
|
44
44
|
<artifactId>iudex-barc</artifactId>
|
45
|
-
<version>[1.
|
45
|
+
<version>[1.1,1.2)</version>
|
46
46
|
</dependency>
|
47
47
|
|
48
48
|
<dependency>
|
@@ -37,10 +37,16 @@ module TestHTTPMocks
|
|
37
37
|
WEAK_ETAG = 'W/"weak-etag"'
|
38
38
|
|
39
39
|
class MockSession < Iudex::HTTP::HTTPSession
|
40
|
-
import 'com.gravitext.util.ByteBufferInputStream'
|
41
40
|
import 'java.nio.ByteBuffer'
|
42
41
|
include Iudex::HTTP
|
43
42
|
|
43
|
+
attr_writer :status
|
44
|
+
|
45
|
+
def initialize
|
46
|
+
super()
|
47
|
+
@status = 200
|
48
|
+
end
|
49
|
+
|
44
50
|
def requestHeaders
|
45
51
|
[ ]
|
46
52
|
end
|
@@ -49,12 +55,12 @@ module TestHTTPMocks
|
|
49
55
|
[ Header.new( "ETag", WEAK_ETAG ) ]
|
50
56
|
end
|
51
57
|
|
52
|
-
def
|
53
|
-
|
58
|
+
def statusCode
|
59
|
+
@status
|
54
60
|
end
|
55
61
|
|
56
|
-
def
|
57
|
-
|
62
|
+
def responseBody
|
63
|
+
ByteBuffer::wrap( "BODY".to_java_bytes )
|
58
64
|
end
|
59
65
|
|
60
66
|
def statusText
|
@@ -62,7 +68,10 @@ module TestHTTPMocks
|
|
62
68
|
end
|
63
69
|
|
64
70
|
def execute( handler )
|
65
|
-
handler.
|
71
|
+
handler.session_completed( self )
|
72
|
+
end
|
73
|
+
|
74
|
+
def close
|
66
75
|
end
|
67
76
|
end
|
68
77
|
|
@@ -88,6 +97,20 @@ module TestHTTPMocks
|
|
88
97
|
end
|
89
98
|
end
|
90
99
|
|
100
|
+
import 'iudex.core.VisitCounter'
|
101
|
+
|
102
|
+
# VisitCounter stand-in for tests: ignores additions and remembers
# the url of the most recently released order.
class TestVisitCounter
  include VisitCounter

  # The url of the last order passed to release, or nil if none yet.
  attr_reader :released

  # No-op.
  def add( order ); end

  # Record the url of the acquired order; newOrder is not used.
  def release( acquired, newOrder )
    @released = acquired.url
  end
end
|
113
|
+
|
91
114
|
end
|
92
115
|
|
93
116
|
class TestContentFetcher < MiniTest::Unit::TestCase
|
@@ -119,54 +142,27 @@ class TestContentFetcher < MiniTest::Unit::TestCase
|
|
119
142
|
def test_304
|
120
143
|
client = MockHTTPClient.new
|
121
144
|
def client.request( session, handler )
|
122
|
-
|
145
|
+
session.status = 304
|
146
|
+
handler.session_completed( session )
|
123
147
|
end
|
124
148
|
fetch( create_content, client ) do |out|
|
125
149
|
assert_equal( DEFAULT_URL, out.url.to_s )
|
126
150
|
assert_equal( 304, out.status )
|
127
|
-
assert_nil( out.etag )
|
128
|
-
assert_nil( out.source )
|
129
|
-
end
|
130
|
-
end
|
131
|
-
|
132
|
-
REDIRECT_URL = "http://gravitext.com/redirect#foo"
|
133
|
-
REDIRECT_NORM = "http://gravitext.com/redirect"
|
134
|
-
|
135
|
-
def test_redirect
|
136
|
-
client = MockHTTPClient.new
|
137
|
-
def client.create_session
|
138
|
-
s = MockSession.new
|
139
|
-
def s.execute( handler )
|
140
|
-
self.url = REDIRECT_URL
|
141
|
-
super
|
142
|
-
end
|
143
|
-
s
|
144
|
-
end
|
145
|
-
fetch( create_content, client ) do |out|
|
146
|
-
assert_equal( REDIRECT_NORM, out.url.to_s )
|
147
|
-
assert_equal( 200, out.status )
|
148
|
-
|
149
|
-
ref = out.referer
|
150
|
-
|
151
|
-
assert_equal( DEFAULT_URL, ref.url.to_s )
|
152
|
-
assert_equal( 302, ref.status )
|
153
|
-
assert_equal( REDIRECT_NORM, ref.referent.url.to_s )
|
154
151
|
end
|
155
152
|
end
|
156
153
|
|
157
154
|
import "java.net.UnknownHostException"
|
158
|
-
import "java.io.IOException"
|
159
155
|
|
160
156
|
def test_connect_error
|
161
157
|
client = MockHTTPClient.new
|
162
158
|
def client.create_session
|
163
159
|
s = MockSession.new
|
164
160
|
def s.execute( handler )
|
165
|
-
|
166
|
-
|
161
|
+
self.error = UnknownHostException.new( "foobar.com" )
|
162
|
+
handler.session_completed( self )
|
167
163
|
end
|
168
|
-
def s.
|
169
|
-
|
164
|
+
def s.statusCode
|
165
|
+
-1
|
170
166
|
end
|
171
167
|
def s.responseHeaders
|
172
168
|
nil
|
@@ -182,7 +178,9 @@ class TestContentFetcher < MiniTest::Unit::TestCase
|
|
182
178
|
|
183
179
|
def fetch( content, client = MockHTTPClient.new, &block )
|
184
180
|
rec = TestReceiver.new( &block )
|
181
|
+
counter = TestVisitCounter.new
|
185
182
|
cf = ContentFetcher.new( client,
|
183
|
+
counter,
|
186
184
|
FilterChain.new( "test-rec", [ rec ] ) )
|
187
185
|
cf.filter( content )
|
188
186
|
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
require 'iudex-core'
|
22
|
+
|
23
|
+
# Tests of ContentSource default encoding and encoding confidence
# accessors.
class TestContentSource < MiniTest::Unit::TestCase
  include Iudex::Core

  import 'java.nio.ByteBuffer'
  import 'java.nio.charset.Charset'

  def self.charset( name )
    Charset::lookup( name )
  end

  UTF8 = charset( "UTF-8" )
  ISO1 = charset( "ISO-8859-1" )

  def setup
    @cs = ContentSource.new( ByteBuffer::wrap( "any".to_java_bytes ) )
  end

  # A new ContentSource should report no default encoding. This needs
  # a name distinct from test_default_encoding below: two methods with
  # the same name means the later definition replaces the earlier, and
  # this check would never run.
  def test_default_encoding_unset
    refute( @cs.default_encoding )
  end

  # Setting an encoding with zero confidence is still accepted.
  def test_default_encoding
    assert( @cs.set_default_encoding( UTF8, 0.0 ) )
    assert_equal( UTF8, @cs.default_encoding )
    assert_in_epsilon( 0.0, @cs.encoding_confidence )
  end

  # Repeated settings of the same encoding sum their confidences.
  def test_default_encoding_additive
    2.times { assert( @cs.set_default_encoding( UTF8, 0.10 ) ) }
    assert_equal( UTF8, @cs.default_encoding )
    assert_in_epsilon( 0.20, @cs.encoding_confidence )
  end

  # Settings given as a map of encoding to confidence.
  def test_default_encoding_map
    assert( @cs.set_default_encoding( { UTF8 => f( 0.10 ),
                                        ISO1 => f( 0.20 ) } ) )
    assert_equal( ISO1, @cs.default_encoding )
    assert_in_epsilon( 0.20, @cs.encoding_confidence )

    # Expected falsy results: an empty map, and a UTF8 increment that
    # leaves ISO1 in place (presumably "default unchanged" -- confirm
    # against ContentSource).
    refute( @cs.set_default_encoding( {} ) )
    refute( @cs.set_default_encoding( { UTF8 => f( 0.05 ) } ) )
    assert( @cs.set_default_encoding( { UTF8 => f( 0.07 ),
                                        ISO1 => f( 0.01 ) } ) )

    # UTF8 total: 0.10 + 0.05 + 0.07
    assert_equal( UTF8, @cs.default_encoding )
    assert_in_epsilon( 0.22, @cs.encoding_confidence )
  end

  # Wrap a Ruby float as a java.lang.Float for use as a map value.
  def f( v )
    Java::java.lang.Float.new( v )
  end

end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
#.hashdot.profile += jruby-shortlived
|
4
|
+
|
5
|
+
#--
|
6
|
+
# Copyright (c) 2011 David Kellum
|
7
|
+
#
|
8
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
|
+
# may not use this file except in compliance with the License. You
|
10
|
+
# may obtain a copy of the License at
|
11
|
+
#
|
12
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
13
|
+
#
|
14
|
+
# Unless required by applicable law or agreed to in writing, software
|
15
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
16
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
17
|
+
# implied. See the License for the specific language governing
|
18
|
+
# permissions and limitations under the License.
|
19
|
+
#++
|
20
|
+
|
21
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
22
|
+
require 'iudex-core/mojibake'
|
23
|
+
|
24
|
+
class TestMojiBake < MiniTest::Unit::TestCase
|
25
|
+
include Gravitext::HTMap
|
26
|
+
include Iudex::Core
|
27
|
+
include Iudex::Core::Filters
|
28
|
+
|
29
|
+
UniMap.define_accessors
|
30
|
+
|
31
|
+
FILTER = MojiBakeFilter.new( ContentKeys::SUMMARY )
|
32
|
+
|
33
|
+
def test_nomatch_recover
|
34
|
+
assert_filter( '', '' )
|
35
|
+
assert_filter( 'ascii', 'ascii' )
|
36
|
+
assert_filter( 'Â', 'Â' )
|
37
|
+
end
|
38
|
+
|
39
|
+
def test_simple_recover
|
40
|
+
assert_filter( '[°]', '[°]' )
|
41
|
+
assert_filter( '“quoted”', '“quotedâ€�' )
|
42
|
+
assert_filter( '“quoted”', 'âquotedâ€' )
|
43
|
+
end
|
44
|
+
|
45
|
+
def test_recursive_recover
|
46
|
+
assert_filter( '°', '°' )
|
47
|
+
assert_filter( 'AP – Greenlake', 'AP – Greenlake' )
|
48
|
+
assert_filter( 'you’re', 'you’re' )
|
49
|
+
end
|
50
|
+
|
51
|
+
# Assert that FILTER accepts a map whose summary is input, and that
# the summary afterwards reads as output.
def assert_filter( output, input )
  content = UniMap.new.tap { |m| m.summary = input }
  accepted = FILTER.filter( content )
  assert( accepted )
  assert_equal( output, content.summary.to_s, "From: #{input}" )
end
|
57
|
+
|
58
|
+
end
|