iudex-da 1.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.rdoc +2 -0
- data/Manifest.txt +32 -0
- data/README.rdoc +30 -0
- data/Rakefile +49 -0
- data/bin/iudex-da-generate-test-data +127 -0
- data/bin/iudex-da-import +62 -0
- data/bin/iudex-da-simhash-dump +112 -0
- data/bin/iudex-migrate +66 -0
- data/config/config.rb +14 -0
- data/db/0010_base_urls.rb +84 -0
- data/db/0020_add_feed_metadata.rb +37 -0
- data/db/0021_more_feed_text.rb +29 -0
- data/db/0030_add_priority.rb +28 -0
- data/db/0040_add_visit_after.rb +30 -0
- data/db/0050_add_cache_location.rb +32 -0
- data/db/0060_url_indexes.rb +41 -0
- data/db/0070_add_created_at.rb +28 -0
- data/db/0080_add_simhash.rb +33 -0
- data/lib/iudex-da.rb +40 -0
- data/lib/iudex-da/ar.rb +48 -0
- data/lib/iudex-da/base.rb +23 -0
- data/lib/iudex-da/config.rb +31 -0
- data/lib/iudex-da/factory_helper.rb +53 -0
- data/lib/iudex-da/importer.rb +91 -0
- data/lib/iudex-da/iudex-da-1.0.0.jar +0 -0
- data/lib/iudex-da/key_helper.rb +33 -0
- data/lib/iudex-da/pool_data_source_factory.rb +108 -0
- data/pom.xml +86 -0
- data/test/setup.rb +34 -0
- data/test/test_migrate.rb +41 -0
- data/test/test_poll_work.rb +132 -0
- data/test/test_pool_factory.rb +59 -0
- metadata +203 -0
data/config/config.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
|
2
|
+
Iudex.configure do |c|
|
3
|
+
|
4
|
+
# Set DA connection properties
|
5
|
+
c.setup_connect_props do
|
6
|
+
{ :host => 'localhost',
|
7
|
+
:database => 'iudex_test',
|
8
|
+
:username => 'iudex',
|
9
|
+
:ds_pool => { :max_active => 4,
|
10
|
+
:max_idle => 2 },
|
11
|
+
:loglevel => 2 }
|
12
|
+
end
|
13
|
+
|
14
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
# http://api.rubyonrails.org/classes/ActiveRecord/Migration.html
|
18
|
+
# http://api.rubyonrails.org/classes/ActiveRecord/ConnectionAdapters/TableDefinition.html
|
19
|
+
|
20
|
+
# Base Urls table schema
|
21
|
+
class BaseUrls < ActiveRecord::Migration
|
22
|
+
def self.up
|
23
|
+
create_table( 'urls', :id => false ) {}
|
24
|
+
|
25
|
+
add_column( 'urls', 'uhash', :text, :null => false )
|
26
|
+
# 23 byte ASCII PRIMARY KEY SHA-1 hash fragment of URL
|
27
|
+
# (Note :limit not useful.)
|
28
|
+
|
29
|
+
add_column( 'urls', 'url', :text, :null => false )
|
30
|
+
# Complete normalized url (exactly as used for uhash)
|
31
|
+
|
32
|
+
add_column( 'urls', 'host', :text, :null => false )
|
33
|
+
# Normalized host portion of URL
|
34
|
+
|
35
|
+
add_column( 'urls', 'type', :text, :null => false )
|
36
|
+
# FEED, PAGE, ROBOTS, SITEMAP
|
37
|
+
# Potentially speculative (i.e. "PAGE" before visited)
|
38
|
+
# FIXME: Or REDIRECT here instead of status?
|
39
|
+
|
40
|
+
add_column( 'urls', 'etag', :text )
|
41
|
+
# HTTP ETag header used for subsequent conditional GET
|
42
|
+
# Should only be on 200 and related HTTP status, not redirect
|
43
|
+
|
44
|
+
add_column( 'urls', 'last_visit', 'timestamp with time zone' )
|
45
|
+
# Time of last visit (and thus last type,status,reason,etc.)
|
46
|
+
|
47
|
+
add_column( 'urls', 'status', :integer )
|
48
|
+
# HTTP status code or special (negative) status mapping
|
49
|
+
# null : Not yet visited
|
50
|
+
# -1 : Connection Failed
|
51
|
+
# 4xx : Permanent Failures
|
52
|
+
# 5xx : Transient server error
|
53
|
+
# 200 : Success
|
54
|
+
# 304 : Not Modified
|
55
|
+
# 301,302 : Redirect
|
56
|
+
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
|
57
|
+
# Compare to: http://crawler.archive.org/articles/user_manual/glossary.html#statuscodes
|
58
|
+
|
59
|
+
add_column( 'urls', 'pass', :boolean )
|
60
|
+
# null : Not yet processed (i.e. visit failed)
|
61
|
+
# false : Rejected by processing (for reason), DELETE required
|
62
|
+
# true : Fully Processed
|
63
|
+
|
64
|
+
add_column( 'urls', 'reason', :text )
|
65
|
+
# null : None
|
66
|
+
# DUPE : Duplicate of referent
|
67
|
+
# rejection filter (intended as key)
|
68
|
+
|
69
|
+
add_column( 'urls', 'referent', :text )
|
70
|
+
# null : None
|
71
|
+
# uhash of url this is referring to
|
72
|
+
# (includes status:REDIRECT, reason:DUPE, etc.)
|
73
|
+
|
74
|
+
add_column( 'urls', 'referer', :text )
|
75
|
+
# null : None
|
76
|
+
# uhash of url this was referred from (i.e. the feed URL).
|
77
|
+
|
78
|
+
execute( "ALTER TABLE urls ADD PRIMARY KEY (uhash)" )
|
79
|
+
end
|
80
|
+
|
81
|
+
def self.down
|
82
|
+
drop_table 'urls'
|
83
|
+
end
|
84
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
class AddFeedMetadata < ActiveRecord::Migration
|
18
|
+
|
19
|
+
def self.up
|
20
|
+
add_column( 'urls', 'title', :text )
|
21
|
+
# PAGE,FEED title
|
22
|
+
|
23
|
+
add_column( 'urls', 'ref_pub_date', 'timestamp with time zone' )
|
24
|
+
# (Latest) published date as provided from feed (may be ahead of
|
25
|
+
# or set before pub_date, below).
|
26
|
+
|
27
|
+
add_column( 'urls', 'pub_date', 'timestamp with time zone' )
|
28
|
+
# (Latest) published date as processed
|
29
|
+
end
|
30
|
+
|
31
|
+
def self.down
|
32
|
+
remove_column( 'urls', 'title' )
|
33
|
+
remove_column( 'urls', 'ref_pub_date' )
|
34
|
+
remove_column( 'urls', 'pub_date' )
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
class MoreFeedText < ActiveRecord::Migration
|
18
|
+
|
19
|
+
def self.up
|
20
|
+
add_column( 'urls', 'summary', :text )
|
21
|
+
add_column( 'urls', 'content', :text )
|
22
|
+
end
|
23
|
+
|
24
|
+
def self.down
|
25
|
+
remove_column( 'urls', 'summary' )
|
26
|
+
remove_column( 'urls', 'content' )
|
27
|
+
end
|
28
|
+
|
29
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
class AddPriority < ActiveRecord::Migration
|
18
|
+
|
19
|
+
def self.up
|
20
|
+
add_column( 'urls', 'priority', 'real', :null => false, :default => 0.0 )
|
21
|
+
# Prioritization of next visit, range -INF,+INF
|
22
|
+
end
|
23
|
+
|
24
|
+
def self.down
|
25
|
+
remove_column( 'urls', 'priority' )
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
class AddVisitAfter < ActiveRecord::Migration
|
18
|
+
|
19
|
+
def self.up
|
20
|
+
add_column( 'urls', 'next_visit_after', 'timestamp with time zone' )
|
21
|
+
execute 'ALTER TABLE urls ALTER COLUMN next_visit_after SET DEFAULT now()'
|
22
|
+
# null: never visit (terminal result)
|
23
|
+
# Don't visit again before the specified date.
|
24
|
+
end
|
25
|
+
|
26
|
+
def self.down
|
27
|
+
remove_column( 'urls', 'next_visit_after' )
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
class AddCacheLocation < ActiveRecord::Migration
|
18
|
+
|
19
|
+
def self.up
|
20
|
+
add_column( 'urls', 'cache_file', :integer, :limit => 4 )
|
21
|
+
# 32-bit file number
|
22
|
+
|
23
|
+
add_column( 'urls', 'cache_file_offset', :integer, :limit => 8 )
|
24
|
+
# 64-bit byte offset within file
|
25
|
+
end
|
26
|
+
|
27
|
+
def self.down
|
28
|
+
remove_column 'urls', 'cache_file'
|
29
|
+
remove_column 'urls', 'cache_file_offset'
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
# http://api.rubyonrails.org/classes/ActiveRecord/Migration.html
|
18
|
+
# http://api.rubyonrails.org/classes/ActiveRecord/ConnectionAdapters/TableDefinition.html
|
19
|
+
|
20
|
+
# Indexes for urls table
|
21
|
+
class UrlIndexes < ActiveRecord::Migration
|
22
|
+
def self.up
|
23
|
+
# FIXME: Disabled for now. Which are helpful?
|
24
|
+
|
25
|
+
# add_index( 'urls', [ 'host' ] )
|
26
|
+
# Used by (now obsolete) LIMIT/sub-query based work poll
|
27
|
+
|
28
|
+
# add_index( 'urls', [ 'priority' ] )
|
29
|
+
# FIXME: Consider partial index, e.g. WHERE next_visit_after IS NOT NULL?
|
30
|
+
# FIXME: Consider a combined index 'host', 'priority'?
|
31
|
+
|
32
|
+
# add_index( 'urls', [ 'next_visit_after' ] )
|
33
|
+
# Used by (now obsolete) LIMIT/sub-query based work poll
|
34
|
+
end
|
35
|
+
|
36
|
+
def self.down
|
37
|
+
# remove_index( 'urls', 'host' )
|
38
|
+
# remove_index( 'urls', 'priority' )
|
39
|
+
# remove_index( 'urls', 'next_visit_after' )
|
40
|
+
end
|
41
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
class AddCreatedAt < ActiveRecord::Migration
|
18
|
+
|
19
|
+
def self.up
|
20
|
+
add_column( 'urls', 'created_at', 'timestamp with time zone' )
|
21
|
+
execute 'ALTER TABLE urls ALTER COLUMN created_at SET DEFAULT now()'
|
22
|
+
end
|
23
|
+
|
24
|
+
def self.down
|
25
|
+
remove_column( 'urls', 'created_at' )
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
class AddSimhash < ActiveRecord::Migration
|
18
|
+
|
19
|
+
def self.up
|
20
|
+
add_column( 'urls', 'simhash', :integer, :limit => 8 )
|
21
|
+
# A simhash signature as a signed 8-byte long (should be
|
22
|
+
# compatible with java long).
|
23
|
+
|
24
|
+
add_index( 'urls', [ 'simhash' ] )
|
25
|
+
# And its index
|
26
|
+
end
|
27
|
+
|
28
|
+
def self.down
|
29
|
+
remove_index( 'urls', 'simhash' )
|
30
|
+
remove_column( 'urls', 'simhash' )
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
data/lib/iudex-da.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-core'
|
18
|
+
|
19
|
+
require 'rjack-commons-dbcp'
|
20
|
+
require 'rjack-commons-dbutils'
|
21
|
+
|
22
|
+
require 'iudex-da/base'
|
23
|
+
require 'iudex-da/config'
|
24
|
+
|
25
|
+
require 'java'
|
26
|
+
|
27
|
+
module Iudex
|
28
|
+
module DA
|
29
|
+
|
30
|
+
require "#{LIB_DIR}/iudex-da-#{VERSION}.jar"
|
31
|
+
|
32
|
+
import 'iudex.da.WorkPoller'
|
33
|
+
import 'iudex.da.ContentMapper'
|
34
|
+
|
35
|
+
module Filters
|
36
|
+
import 'iudex.da.filters.UpdateFilter'
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
40
|
+
end
|
data/lib/iudex-da/ar.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'rjack-slf4j'
|
18
|
+
require 'iudex-da/config'
|
19
|
+
require 'active_record'
|
20
|
+
require 'hooker'
|
21
|
+
|
22
|
+
module Iudex::DA
|
23
|
+
|
24
|
+
def self.setup
|
25
|
+
log = RJack::SLF4J[ "iudex.da.ActiveRecord" ]
|
26
|
+
conf = Hooker.merge( [ :iudex, :connect_props ], CONFIG )
|
27
|
+
log.info { "Connecting: #{ conf.inspect }" }
|
28
|
+
|
29
|
+
ActiveRecord::Base.logger = log
|
30
|
+
ActiveRecord::Base.establish_connection( conf )
|
31
|
+
end
|
32
|
+
|
33
|
+
setup #FIXME: Require explicit setup for use?
|
34
|
+
|
35
|
+
def migrate( target_version = nil )
|
36
|
+
ActiveRecord::Migrator.migrate( File.join( LIB_DIR, '..', '..', 'db' ),
|
37
|
+
target_version )
|
38
|
+
#FIXME: Support additional migration directories?
|
39
|
+
end
|
40
|
+
|
41
|
+
module_function :migrate
|
42
|
+
|
43
|
+
class Url < ActiveRecord::Base
|
44
|
+
set_primary_key :uhash
|
45
|
+
set_inheritance_column :object_type # since "type" used already
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|