iudex-core 1.0.0-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,96 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+
4
+ #--
5
+ # Copyright (c) 2008-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You may
9
+ # obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require 'erb'
21
+
22
# Generator for TLDSets.java from input effective_tld_name.dat
# See http://publicsuffix.org/
#
# Typical use is TLDSetGenerator.new.run, which parses the rule file
# and then renders the TLDSets.java.erb template (found next to this
# script) to the Java output path.
class TLDSetGenerator

  # Lists filled in by #parse: plain rules, the parents of wildcard
  # ("*.") rules, and exception ("!") rules, respectively.
  attr_reader :tlds, :tld_parents, :reg_exceptions

  BASEDIR = File.dirname( __FILE__ )

  INPUT_DAT = File.join( BASEDIR, 'effective_tld_name.dat' )

  JAVA_OUT = File.join( BASEDIR, '..', 'src',
                        'main', 'java', 'iudex', 'core', 'TLDSets.java' )

  # Indent placed before every line produced by #format_list.
  LIST_INDENT = ' ' * 8

  # Highest string index at which #format_list will break a line
  # (the break is made directly after a comma).
  WRAP_INDEX = 71

  # Parse tld_file and write the generated Java source to java_file.
  def run( tld_file = INPUT_DAT, java_file = JAVA_OUT )
    parse( tld_file )
    generate_java( java_file )
  end

  # Read the rule file line by line, sorting each rule into one of the
  # three lists. Comment ("//") and blank lines are skipped.
  #
  # Returns [ tlds, tld_parents, reg_exceptions ].
  # Raises RuntimeError for any line matching none of the rule forms.
  def parse( tld_file )
    @tlds = []
    @tld_parents = []
    @reg_exceptions = []

    # File.open rather than Kernel#open: the path may come from ARGV
    # and Kernel#open would treat a leading "|" as a command to run.
    File.open( tld_file, "r" ) do |fin|
      fin.each do |line|
        case line
        when %r{^\s*//}, /^\s*$/
          # ignore comment, empty lines
        when /^\s*([^\s\*\!]+)\s*$/
          @tlds << $1
        when /^\s*\*\.([^\s\*\!]+)\s*$/
          @tld_parents << $1
        when /^\s*\!([^\s\*\!]+)\s*$/
          @reg_exceptions << $1
        else
          raise "Parse ERROR: line [#{line}]"
        end
      end
    end

    [ @tlds, @tld_parents, @reg_exceptions ]
  end

  # Render TLDSets.java.erb with this instance's binding (so the
  # template can reach the parsed lists and #format_list) and write the
  # result to java_file.
  def generate_java( java_file )
    erb_file = File.join( BASEDIR, 'TLDSets.java.erb' )
    template = ERB.new( File.read( erb_file ), nil, '%' )

    File.open( java_file, 'w' ) do |fout|
      fout << template.result( binding )
    end
  end

  # Format list as double-quoted, comma separated entries, wrapped onto
  # multiple lines that each start with LIST_INDENT. Lines are broken
  # after a comma; there is no trailing newline. Returns "" for an
  # empty list.
  def format_list( list )
    all = list.map { |d| '"' + d + '"' }.join( ", " )
    out = ""
    until all.empty?
      out << LIST_INDENT
      cut = break_index( all )
      if cut
        out << all.slice!( 0..cut )
        all.lstrip!
      else
        out << all
        all = ""
      end
      out << "\n" unless all.empty?
    end
    out
  end

  # Index of the comma after which text should be broken, or nil when
  # the remainder should be emitted on a single line. Prefers the last
  # comma at or before WRAP_INDEX; when a single entry is too long for
  # that, falls back to the first comma after it so the over-long entry
  # gets a line of its own instead of yielding a nil slice bound.
  def break_index( text )
    return nil unless text.length > WRAP_INDEX
    text.rindex( ',', WRAP_INDEX ) || text.index( ',', WRAP_INDEX )
  end
  private :break_index

end
93
+
94
# When executed directly (rather than required), run the generator,
# passing any command line arguments through as the optional input
# and output paths.
TLDSetGenerator.new.run( *ARGV ) if __FILE__ == $0
data/config/config.rb ADDED
@@ -0,0 +1,12 @@
1
+
2
Iudex.configure do |config|

  # This block raises if it is ever invoked.
  # NOTE(review): presumably registered under a name nothing applies,
  # to check that unused setup blocks are never called -- confirm.
  config.setup_bogus do
    raise "Shouldn't call"
  end

  # Settings applied to the visit executor.
  # NOTE(review): the delay/wait values look like milliseconds -- confirm.
  config.setup_visit_executor do |executor|
    executor.max_threads       = 10
    executor.min_host_delay    = 2_000
    executor.max_shutdown_wait = 19_000
  end

end
data/lib/iudex-core.rb ADDED
@@ -0,0 +1,49 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'gravitext-util'
18
+ require 'iudex-http'
19
+ require 'iudex-filter'
20
+ require 'iudex-barc'
21
+
22
+ require 'iudex-core/base'
23
+
24
+ require 'iudex-core/config'
25
+
26
+ require 'java'
27
+
28
module Iudex
  module Core
    # Load the jar that sits beside this file, named after the gem VERSION.
    require File.join( LIB_DIR, "iudex-core-#{VERSION}.jar" )

    # Java classes from package iudex.core imported into this module.
    %w[ ContentKeys
        ContentSource
        VisitExecutor
        VisitURL ].each do |java_class|
      import "iudex.core.#{java_class}"
    end

    # Java classes from package iudex.core.filters.
    module Filters
      %w[ BARCWriter
          ContentFetcher
          DateChangeFilter
          DefaultFilter
          FutureDateFilter
          RLDomainFilter
          TextCtrlWSFilter
          UHashMDCSetter ].each do |java_class|
        import "iudex.core.filters.#{java_class}"
      end
    end

  end
end
@@ -0,0 +1,23 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
module Iudex
  module Core
    # Version of this gem. Frozen so the shared constant string cannot
    # be mutated in place.
    VERSION = '1.0.0'.freeze

    # Directory containing this file. Frozen for the same reason.
    LIB_DIR = File.dirname( __FILE__ ).freeze # :nodoc:
  end
end
@@ -0,0 +1,51 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'rjack-slf4j'
18
+ require 'optparse'
19
+ require 'hooker'
20
+
21
# Send Hooker's log messages, minus trailing whitespace, to the
# 'iudex' SLF4J logger at info level.
Hooker.log_with do |message|
  RJack::SLF4J[ 'iudex' ].info( message.rstrip )
end
22
+
23
module Iudex

  # Apply configuration from block, passing it to Hooker under the
  # :iudex scope.
  def self.configure( &block )
    Hooker.with( :iudex, &block )
  end

  module Core

    # <b>DEPRECATED:</b> Extensible module defining top level
    # configuration blocks.
    module Config

      # <b>DEPRECATED:</b> Parse options using an OptionParser,
      # defining (-c)onfig option, and yielding to block for further
      # option handling.
      #
      # args:: Array of command line arguments to parse (default ARGV).
      #        Recognized options are removed from it in place.
      # block:: Optional; called with the OptionParser so that callers
      #         can define additional options.
      #
      # Returns the arguments left over after option parsing and emits
      # a deprecation warning naming the caller.
      def self.parse_options( args = ARGV, &block )
        warn( "DEPRECATED parse_options called from #{caller.first.to_s}\n" +
              "Use Hooker.register_config and OptionParser.new instead." )
        parser = OptionParser.new do |opts|
          Hooker.register_config( opts )
          block.call( opts ) if block
        end
        # Parse the array that was passed in; a bare parse! would always
        # fall back to ARGV and silently ignore the args parameter.
        parser.parse!( args )
      end

    end
  end
end
Binary file
data/pom.xml ADDED
@@ -0,0 +1,74 @@
1
+ <?xml version="1.0" encoding="utf-8"?>
2
+ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
3
+
4
+ <modelVersion>4.0.0</modelVersion>
5
+ <groupId>iudex</groupId>
6
+ <artifactId>iudex-core</artifactId>
7
+ <packaging>jar</packaging>
8
+ <version>1.0.0</version>
9
+ <name>Iudex Core System</name>
10
+
11
+ <parent>
12
+ <groupId>iudex</groupId>
13
+ <artifactId>iudex-parent</artifactId>
14
+ <version>1.0</version>
15
+ <relativePath>..</relativePath>
16
+ </parent>
17
+
18
+ <dependencies>
19
+
20
+ <dependency>
21
+ <groupId>org.slf4j</groupId>
22
+ <artifactId>slf4j-api</artifactId>
23
+ </dependency>
24
+
25
+ <dependency>
26
+ <groupId>com.gravitext</groupId>
27
+ <artifactId>gravitext-util</artifactId>
28
+ </dependency>
29
+
30
+ <dependency>
31
+ <groupId>iudex</groupId>
32
+ <artifactId>iudex-filter</artifactId>
33
+ <version>[1.0,1.1)</version>
34
+ </dependency>
35
+
36
+ <dependency>
37
+ <groupId>iudex</groupId>
38
+ <artifactId>iudex-http</artifactId>
39
+ <version>[1.0,1.1)</version>
40
+ </dependency>
41
+
42
+ <dependency>
43
+ <groupId>iudex</groupId>
44
+ <artifactId>iudex-barc</artifactId>
45
+ <version>[1.0,1.1)</version>
46
+ </dependency>
47
+
48
+ <dependency>
49
+ <groupId>junit</groupId>
50
+ <artifactId>junit</artifactId>
51
+ </dependency>
52
+
53
+ <dependency>
54
+ <groupId>ch.qos.logback</groupId>
55
+ <artifactId>logback-classic</artifactId>
56
+ <scope>test</scope>
57
+ </dependency>
58
+
59
+ </dependencies>
60
+
61
+ <build>
62
+ <plugins>
63
+ <plugin>
64
+ <!-- Parent settings -->
65
+ <artifactId>maven-compiler-plugin</artifactId>
66
+ </plugin>
67
+ <plugin>
68
+ <!-- Parent settings -->
69
+ <artifactId>maven-source-plugin</artifactId>
70
+ </plugin>
71
+ </plugins>
72
+ </build>
73
+
74
+ </project>
data/test/setup.rb ADDED
@@ -0,0 +1,40 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ #### General test setup: LOAD_PATH, logging, console output ####
18
+
19
# Put the sibling lib directory at the front of the load path, once.
lib_path = File.join( File.dirname( __FILE__ ), "..", "lib" )
unless $LOAD_PATH.include?( lib_path )
  $LOAD_PATH.unshift( lib_path )
end
21
+
22
+ require 'rubygems'
23
+ require 'rjack-logback'
24
+ require 'minitest/unit'
25
+ require 'minitest/autorun'
26
+
27
# Configures Logback console output on stderr when loaded, raising the
# root level to DEBUG if -v or --verbose appears on the command line.
module TestSetup
  include RJack

  Logback.config_console( :stderr => true )

  verbose = [ '--verbose', '-v' ].any? { |flag| ARGV.include?( flag ) }
  Logback.root.level = Logback::DEBUG if verbose
end
34
+
35
# Make test output logging compatible: no partial lines. Both print
# and puts write complete, newline terminated lines to $stdout.
class TestOut
  def print( *items )
    $stdout.puts( *items )
  end

  def puts( *items )
    $stdout.puts( *items )
  end
end
40
# Route MiniTest's own output through a line-oriented TestOut.
line_output = TestOut.new
MiniTest::Unit.output = line_output
@@ -0,0 +1,200 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2008-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+ require 'iudex-core'
22
+
23
# Mock HTTP client and session, plus a block-driven receiver filter,
# for tests that need to run a fetch without real network access.
module TestHTTPMocks
  include Iudex::Filter

  # ETag header value returned by every MockSession.
  WEAK_ETAG = 'W/"weak-etag"'

  # HTTPClient that hands out MockSession instances and "requests" by
  # asking the session to execute against the handler.
  class MockHTTPClient
    include Iudex::HTTP::HTTPClient

    def create_session
      MockSession.new
    end

    def request( session, handler )
      session.execute( handler )
    end
  end

  # Session reporting a canned 200 response: no request headers, one
  # ETag response header and an empty body.
  class MockSession < Iudex::HTTP::HTTPSession
    import 'com.gravitext.util.ByteBufferInputStream'
    import 'java.nio.ByteBuffer'
    include Iudex::HTTP

    def requestHeaders
      []
    end

    def responseHeaders
      etag = Header.new( "ETag", WEAK_ETAG )
      [ etag ]
    end

    def responseCode
      200
    end

    def responseStream
      empty_body = ByteBuffer.wrap( "".to_java_bytes )
      ByteBufferInputStream.new( empty_body )
    end

    def statusText
      "status text"
    end

    # Report success to the handler straight away.
    def execute( handler )
      handler.handle_success( self )
    end
  end

  # Filter that debug-logs whatever it receives and then passes it to
  # the block given at construction.
  class TestReceiver < FilterBase
    def initialize( &block )
      @block = block
      @log = RJack::SLF4J[ self.class ]
    end

    def filter( out )
      pretty_log( out )
      @block.call( out )
    end

    # Debug-log out.to_s with each "{" moved to a new line, indented
    # one space deeper than the previous one, and each "}" padded.
    def pretty_log( out )
      @log.debug do
        depth = 0
        opened = out.to_s.gsub( /{/ ) do
          depth += 1
          "\n" + ( ' ' * depth ) + "{ "
        end
        opened.gsub( /}/, " }" )
      end
    end
  end

end
92
+
93
# Tests of ContentFetcher run against the TestHTTPMocks client and
# sessions, with per-test singleton overrides to simulate a 304, a
# redirect and a connect failure.
class TestContentFetcher < MiniTest::Unit::TestCase
  include Iudex::Core
  include Iudex::Core::Filters
  include Iudex::Filter::Core
  include Gravitext::HTMap

  UniMap.define_accessors

  include TestHTTPMocks

  import "java.net.UnknownHostException"
  import "java.io.IOException"

  DEFAULT_URL   = "http://gravitext.com/test"
  REDIRECT_URL  = "http://gravitext.com/redirect#foo"
  REDIRECT_NORM = "http://gravitext.com/redirect"

  def setup
    @fetcher = nil
  end

  # Plain success: url, status, etag and source are all set.
  def test_simple
    content = create_content
    fetch( content ) do |result|
      assert_equal( DEFAULT_URL, result.url.to_s )
      assert_equal( 200, result.status )
      assert_equal( WEAK_ETAG, result.etag )
      assert( result.source )
    end
  end

  # Client reports a 304 error: status recorded, no etag or source.
  def test_304
    not_modified = MockHTTPClient.new
    def not_modified.request( session, handler )
      handler.handle_error( session, 304 )
    end

    fetch( create_content, not_modified ) do |result|
      assert_equal( DEFAULT_URL, result.url.to_s )
      assert_equal( 304, result.status )
      assert_nil( result.etag )
      assert_nil( result.source )
    end
  end

  # Session changes its url before succeeding: the result carries the
  # normalized redirect url and a referer recording the original as 302.
  def test_redirect
    redirecting = MockHTTPClient.new
    def redirecting.create_session
      session = MockSession.new
      def session.execute( handler )
        self.url = REDIRECT_URL
        super
      end
      session
    end

    fetch( create_content, redirecting ) do |result|
      assert_equal( REDIRECT_NORM, result.url.to_s )
      assert_equal( 200, result.status )

      referer = result.referer

      assert_equal( DEFAULT_URL, referer.url.to_s )
      assert_equal( 302, referer.status )
      assert_equal( REDIRECT_NORM, referer.referent.url.to_s )
    end
  end

  # Session fails with UnknownHostException: status -1, no response
  # headers, and the reason names the exception.
  def test_connect_error
    failing = MockHTTPClient.new
    def failing.create_session
      session = MockSession.new
      def session.execute( handler )
        handler.handle_exception( self,
                                  UnknownHostException.new( "foobar.com" ) )
      end
      def session.responseCode
        nil
      end
      def session.responseHeaders
        nil
      end
      session
    end

    fetch( create_content, failing ) do |result|
      assert_equal( -1, result.status )
      assert_nil( result.response_headers )
      assert( result.reason =~ /UnknownHostException/ )
    end
  end

  # Run content through a ContentFetcher built on client, with block
  # receiving the outcome via a single TestReceiver chain.
  def fetch( content, client = MockHTTPClient.new, &block )
    receiver = TestReceiver.new( &block )
    chain    = FilterChain.new( "test-rec", [ receiver ] )
    fetcher  = ContentFetcher.new( client, chain )
    fetcher.filter( content )
  end

  # New UniMap with only its url set, to the normalized form of url.
  def create_content( url = DEFAULT_URL )
    content = UniMap.new
    content.url = visit_url( url )
    content
  end

  def visit_url( url )
    VisitURL.normalize( url )
  end

end