iudex-core 1.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,96 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+
4
+ #--
5
+ # Copyright (c) 2008-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You may
9
+ # obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require 'erb'
21
+
22
# Generator for TLDSets.java from input effective_tld_name.dat
# See http://publicsuffix.org/
#
# The input file is parsed into three lists of names, which are then
# rendered through the TLDSets.java.erb template found next to this
# script.
class TLDSetGenerator

  # Lists filled in by #parse: names from plain lines, names from
  # "*.name" lines, and names from "!name" lines (markers removed).
  attr_reader :tlds, :tld_parents, :reg_exceptions

  BASEDIR = File.dirname( __FILE__ )

  INPUT_DAT = File.join( BASEDIR, 'effective_tld_name.dat' )

  JAVA_OUT = File.join( BASEDIR, '..', 'src',
                        'main', 'java', 'iudex', 'core', 'TLDSets.java' )

  # Indentation prepended to every line produced by #format_list.
  LIST_INDENT = ' ' * 8

  # Highest string index at which a #format_list line is normally
  # broken (the break is made on a comma at or before this index).
  LIST_BREAK_AT = 71

  # Parse tld_file and write the generated Java source to java_file.
  def run( tld_file = INPUT_DAT, java_file = JAVA_OUT )
    parse( tld_file )
    generate_java( java_file )
  end

  # Read tld_file line by line and (re)populate the three lists.
  # Lines starting with "//" and blank lines are skipped.
  #
  # Returns [ tlds, tld_parents, reg_exceptions ].
  # Raises RuntimeError for any line matching none of the known forms.
  def parse( tld_file )
    @tlds = []
    @tld_parents = []
    @reg_exceptions = []

    # File.open rather than Kernel#open: the latter would run a
    # sub-process if the path happened to start with "|".
    File.open( tld_file, "r" ) do |fin|
      fin.each_line do |line|
        case line
        when %r{^\s*//}, /^\s*$/
          # ignore comment, empty lines
        when /^\s*([^\s\*\!]+)\s*$/
          @tlds << $1
        when /^\s*\*\.([^\s\*\!]+)\s*$/
          @tld_parents << $1
        when /^\s*\!([^\s\*\!]+)\s*$/
          @reg_exceptions << $1
        else
          raise "Parse ERROR: line [#{line}]"
        end
      end
    end

    [ @tlds, @tld_parents, @reg_exceptions ]
  end

  # Render TLDSets.java.erb (from BASEDIR) with this method's binding,
  # so the template can call the generator's methods, and write the
  # result to java_file, replacing any previous content.
  def generate_java( java_file )
    erb_file = File.join( BASEDIR, 'TLDSets.java.erb' )
    template = new_template( File.read( erb_file ) )

    File.open( java_file, 'w' ) do |fout|
      fout << template.result( binding )
    end
  end

  # Format list as double-quoted, comma separated strings, wrapped
  # onto multiple lines each indented by LIST_INDENT. Lines are broken
  # after a comma. An entry too long to fit is placed on a line of its
  # own instead of failing or swallowing the rest of the list.
  #
  # Returns the formatted String, without a trailing newline ("" for
  # an empty list).
  def format_list( list )
    rest = list.map { |d| '"' + d + '"' }.join( ", " )
    lines = []
    until rest.empty?
      lines << rest.slice!( 0..line_end( rest ) )
      rest.lstrip!
    end
    lines.map { |l| LIST_INDENT + l }.join( "\n" )
  end

  private

  # Index of the last character of the next output line taken from
  # the front of text.
  def line_end( text )
    return text.length - 1 if text.length <= LIST_BREAK_AT

    # Prefer the last comma that still fits; otherwise (over-long
    # entry) break on the first comma after it, or take everything.
    text.rindex( ',', LIST_BREAK_AT ) ||
      text.index( ',', LIST_BREAK_AT + 1 ) ||
      text.length - 1
  end

  # Create an ERB template using the '%' trim mode (lines starting
  # with "%" are treated as Ruby code). Uses the keyword argument when
  # this ERB supports it, since the positional form is deprecated
  # there, and falls back to the positional form on older Rubies.
  def new_template( source )
    if erb_keyword_options?
      ERB.new( source, :trim_mode => '%' )
    else
      ERB.new( source, nil, '%' )
    end
  end

  # True if ERB.new accepts a trim_mode keyword argument.
  def erb_keyword_options?
    init = ERB.instance_method( :initialize )
    init.respond_to?( :parameters ) &&
      init.parameters.include?( [ :key, :trim_mode ] )
  end

end
93
+
94
# When executed directly (rather than required), run the generator,
# passing command line arguments through as the optional input and
# output paths.
TLDSetGenerator.new.run( *ARGV ) if __FILE__ == $0
data/config/config.rb ADDED
@@ -0,0 +1,12 @@
1
+
2
# Sample configuration applied through Iudex.configure.
Iudex.configure do |config|

  # This block raises if it is ever invoked.
  # NOTE(review): presumably nothing applies a "bogus" setup, making
  # this a check that unused setup blocks are never called -- confirm.
  config.setup_bogus do
    raise "Shouldn't call"
  end

  # Settings assigned to the visit executor.
  config.setup_visit_executor do |executor|
    executor.max_threads       = 10
    executor.min_host_delay    = 2_000
    executor.max_shutdown_wait = 19_000
  end

end
data/lib/iudex-core.rb ADDED
@@ -0,0 +1,49 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'gravitext-util'
18
+ require 'iudex-http'
19
+ require 'iudex-filter'
20
+ require 'iudex-barc'
21
+
22
+ require 'iudex-core/base'
23
+
24
+ require 'iudex-core/config'
25
+
26
+ require 'java'
27
+
28
module Iudex

  # Namespace into which the iudex-core jar and its Java classes are
  # loaded.
  module Core

    # Load the jar named after VERSION from LIB_DIR.
    require LIB_DIR + "/iudex-core-" + VERSION + ".jar"

    # Java classes from the iudex.core package.
    import "iudex.core.ContentKeys"
    import "iudex.core.ContentSource"
    import "iudex.core.VisitExecutor"
    import "iudex.core.VisitURL"

    # Java classes from the iudex.core.filters package.
    module Filters
      import "iudex.core.filters.BARCWriter"
      import "iudex.core.filters.ContentFetcher"
      import "iudex.core.filters.DateChangeFilter"
      import "iudex.core.filters.DefaultFilter"
      import "iudex.core.filters.FutureDateFilter"
      import "iudex.core.filters.RLDomainFilter"
      import "iudex.core.filters.TextCtrlWSFilter"
      import "iudex.core.filters.UHashMDCSetter"
    end

  end

end
@@ -0,0 +1,23 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
module Iudex
  module Core

    # Version string of this release.
    VERSION = "1.0.0"

    # Directory containing this file.
    LIB_DIR = File.dirname( __FILE__ ) # :nodoc:

  end
end
@@ -0,0 +1,51 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'rjack-slf4j'
18
+ require 'optparse'
19
+ require 'hooker'
20
+
21
# Send Hooker log messages to the 'iudex' SLF4J logger at info level,
# with trailing whitespace removed.
Hooker.log_with do |msg|
  RJack::SLF4J[ 'iudex' ].info( msg.rstrip )
end
22
+
23
module Iudex

  # Apply configuration from block, by passing it to Hooker.with
  # under the :iudex scope.
  def self.configure( &block )
    Hooker.with( :iudex, &block )
  end

  module Core

    # <b>DEPRECATED:</b> Extensible module defining top level
    # configuration blocks.
    module Config

      # <b>DEPRECATED:</b> Parse options using an OptionParser,
      # defining (-c)onfig option, and yielding to block for further
      # option handling.
      #
      # args::  Array of command line arguments to parse (default
      #         ARGV). Recognized options are removed from it.
      # block:: Optional; called with the OptionParser so the caller
      #         can define additional options.
      #
      # Emits a deprecation warning naming the caller, and returns the
      # result of OptionParser#parse!, i.e. the remaining arguments.
      def self.parse_options( args = ARGV, &block )
        warn( "DEPRECATED parse_options called from #{caller.first.to_s}\n" +
              "Use Hooker.register_config and OptionParser.new instead." )
        parser = OptionParser.new do |opts|
          Hooker.register_config( opts )
          block.call( opts ) if block
        end
        # Pass args explicitly: a bare parse! always parses ARGV and
        # would silently ignore a caller supplied argument list.
        parser.parse!( args )
      end

    end
  end
end
Binary file
data/pom.xml ADDED
@@ -0,0 +1,74 @@
1
+ <?xml version="1.0" encoding="utf-8"?>
2
+ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
3
+
4
+ <modelVersion>4.0.0</modelVersion>
5
+ <groupId>iudex</groupId>
6
+ <artifactId>iudex-core</artifactId>
7
+ <packaging>jar</packaging>
8
+ <version>1.0.0</version>
9
+ <name>Iudex Core System</name>
10
+
11
+ <parent>
12
+ <groupId>iudex</groupId>
13
+ <artifactId>iudex-parent</artifactId>
14
+ <version>1.0</version>
15
+ <relativePath>..</relativePath>
16
+ </parent>
17
+
18
+ <dependencies>
19
+
20
+ <dependency>
21
+ <groupId>org.slf4j</groupId>
22
+ <artifactId>slf4j-api</artifactId>
23
+ </dependency>
24
+
25
+ <dependency>
26
+ <groupId>com.gravitext</groupId>
27
+ <artifactId>gravitext-util</artifactId>
28
+ </dependency>
29
+
30
+ <dependency>
31
+ <groupId>iudex</groupId>
32
+ <artifactId>iudex-filter</artifactId>
33
+ <version>[1.0,1.1)</version>
34
+ </dependency>
35
+
36
+ <dependency>
37
+ <groupId>iudex</groupId>
38
+ <artifactId>iudex-http</artifactId>
39
+ <version>[1.0,1.1)</version>
40
+ </dependency>
41
+
42
+ <dependency>
43
+ <groupId>iudex</groupId>
44
+ <artifactId>iudex-barc</artifactId>
45
+ <version>[1.0,1.1)</version>
46
+ </dependency>
47
+
48
+ <dependency>
49
+ <groupId>junit</groupId>
50
+ <artifactId>junit</artifactId>
51
+ </dependency>
52
+
53
+ <dependency>
54
+ <groupId>ch.qos.logback</groupId>
55
+ <artifactId>logback-classic</artifactId>
56
+ <scope>test</scope>
57
+ </dependency>
58
+
59
+ </dependencies>
60
+
61
+ <build>
62
+ <plugins>
63
+ <plugin>
64
+ <!-- Parent settings -->
65
+ <artifactId>maven-compiler-plugin</artifactId>
66
+ </plugin>
67
+ <plugin>
68
+ <!-- Parent settings -->
69
+ <artifactId>maven-source-plugin</artifactId>
70
+ </plugin>
71
+ </plugins>
72
+ </build>
73
+
74
+ </project>
data/test/setup.rb ADDED
@@ -0,0 +1,40 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ #### General test setup: LOAD_PATH, logging, console output ####
18
+
19
# Put the sibling lib directory at the front of the load path, once.
lib_path = File.join( File.dirname( __FILE__ ), "..", "lib" )
unless $LOAD_PATH.include?( lib_path )
  $LOAD_PATH.unshift( lib_path )
end
21
+
22
+ require 'rubygems'
23
+ require 'rjack-logback'
24
+ require 'minitest/unit'
25
+ require 'minitest/autorun'
26
+
27
# Logging setup for tests, performed once when this module body is
# evaluated.
module TestSetup
  include RJack

  # Console logging, configured with the :stderr option enabled.
  Logback.config_console( :stderr => true )

  # Raise the root logger to DEBUG when run with --verbose or -v.
  verbose = %w[ --verbose -v ].any? { |flag| ARGV.include?( flag ) }
  Logback.root.level = Logback::DEBUG if verbose
end
34
+
35
# Make test output logging compatible: no partial lines.
# Both methods write through $stdout.puts, so every write ends with a
# newline.
class TestOut

  # Same as #puts: arguments are written as complete lines.
  def print( *args )
    $stdout.puts( *args )
  end

  # Write arguments to $stdout as lines.
  def puts( *args )
    $stdout.puts( *args )
  end

end
40
+ MiniTest::Unit.output = TestOut.new
@@ -0,0 +1,200 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2008-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+ require 'iudex-core'
22
+
23
# Mock HTTP client/session and a receiving filter, used to drive the
# ContentFetcher tests without network access.
module TestHTTPMocks
  include Iudex::Filter

  # HTTP client whose sessions are MockSession instances and whose
  # requests are executed directly by the session.
  class MockHTTPClient
    include Iudex::HTTP::HTTPClient

    def create_session
      MockSession.new
    end

    def request( session, handler )
      session.execute( handler )
    end
  end

  # ETag value returned in the MockSession response headers.
  WEAK_ETAG = 'W/"weak-etag"'

  # Session that reports success: status 200, a single ETag header
  # and an empty response body.
  class MockSession < Iudex::HTTP::HTTPSession
    import 'com.gravitext.util.ByteBufferInputStream'
    import 'java.nio.ByteBuffer'
    include Iudex::HTTP

    def requestHeaders
      []
    end

    def responseHeaders
      [ Header.new( "ETag", WEAK_ETAG ) ]
    end

    def responseCode
      200
    end

    # Stream over an empty byte array.
    def responseStream
      empty = ByteBuffer.wrap( "".to_java_bytes )
      ByteBufferInputStream.new( empty )
    end

    def statusText
      "status text"
    end

    # Immediately report success to the handler.
    def execute( handler )
      handler.handle_success( self )
    end
  end

  # Filter which debug-logs each map it receives and then hands it to
  # the block given at construction (where tests make assertions).
  class TestReceiver < FilterBase
    def initialize( &block )
      @block = block
      @log = RJack::SLF4J[ self.class ]
    end

    def filter( out )
      pretty_log( out )
      @block.call( out )
    end

    # Log out.to_s at debug level, starting a new line with one more
    # space of indentation at every "{". The text is built inside the
    # block passed to debug.
    def pretty_log( out )
      @log.debug do
        depth = 0
        opened = out.to_s.gsub( /{/ ) do
          depth += 1
          "\n" + ( ' ' * depth ) + "{ "
        end
        opened.gsub( /}/, " }" )
      end
    end
  end

end
92
+
93
# Tests of the ContentFetcher filter, run against the mock HTTP
# client and session from TestHTTPMocks.
class TestContentFetcher < MiniTest::Unit::TestCase
  include Iudex::Core
  include Iudex::Core::Filters
  include Iudex::Filter::Core
  include Gravitext::HTMap

  UniMap.define_accessors

  include TestHTTPMocks

  def setup
    @fetcher = nil
  end

  DEFAULT_URL = "http://gravitext.com/test"

  # Successful fetch: url, status, etag and source are set.
  def test_simple
    content = create_content
    fetch( content ) do |out|
      assert_equal( DEFAULT_URL, out.url.to_s )
      assert_equal( 200, out.status )
      assert_equal( WEAK_ETAG, out.etag )
      assert( out.source )
    end
  end

  # Client reports a 304 error: status is recorded, and neither etag
  # nor source is set.
  def test_304
    client = MockHTTPClient.new
    def client.request( session, handler )
      handler.handle_error( session, 304 )
    end

    fetch( create_content, client ) do |out|
      assert_equal( DEFAULT_URL, out.url.to_s )
      assert_equal( 304, out.status )
      assert_nil( out.etag )
      assert_nil( out.source )
    end
  end

  REDIRECT_URL  = "http://gravitext.com/redirect#foo"
  REDIRECT_NORM = "http://gravitext.com/redirect"

  # Session url changes during execute: the output carries the
  # normalized redirect url, and its referer is the original url with
  # status 302 pointing back to the output via referent.
  def test_redirect
    client = MockHTTPClient.new
    def client.create_session
      session = MockSession.new
      def session.execute( handler )
        self.url = REDIRECT_URL
        super
      end
      session
    end

    fetch( create_content, client ) do |out|
      assert_equal( REDIRECT_NORM, out.url.to_s )
      assert_equal( 200, out.status )

      referer = out.referer

      assert_equal( DEFAULT_URL, referer.url.to_s )
      assert_equal( 302, referer.status )
      assert_equal( REDIRECT_NORM, referer.referent.url.to_s )
    end
  end

  import "java.net.UnknownHostException"
  import "java.io.IOException"

  # Session reports an UnknownHostException: status is -1, there are
  # no response headers and the reason names the exception.
  def test_connect_error
    client = MockHTTPClient.new
    def client.create_session
      session = MockSession.new
      def session.execute( handler )
        handler.handle_exception( self,
                                  UnknownHostException.new( "foobar.com" ) )
      end
      def session.responseCode
        nil
      end
      def session.responseHeaders
        nil
      end
      session
    end

    fetch( create_content, client ) do |out|
      assert_equal( -1, out.status )
      assert_nil( out.response_headers )
      assert( out.reason =~ /UnknownHostException/ )
    end
  end

  # Run content through a ContentFetcher using client, with block
  # receiving the map passed on to the fetcher's filter chain.
  def fetch( content, client = MockHTTPClient.new, &block )
    receiver = TestReceiver.new( &block )
    chain = FilterChain.new( "test-rec", [ receiver ] )
    fetcher = ContentFetcher.new( client, chain )
    fetcher.filter( content )
  end

  # New UniMap with its url set to the normalized form of url.
  def create_content( url = DEFAULT_URL )
    content = UniMap.new
    content.url = visit_url( url )
    content
  end

  def visit_url( url )
    VisitURL.normalize( url )
  end

end