iudex-da 1.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/config/config.rb ADDED
@@ -0,0 +1,14 @@
1
+
2
+ Iudex.configure do |c|
3
+
4
+ # Set DA connection properties: the block below returns a Hash of settings (host, database, username, ds_pool sizing, loglevel)
5
+ c.setup_connect_props do
6
+ { :host => 'localhost',
7
+ :database => 'iudex_test',
8
+ :username => 'iudex',
9
+ :ds_pool => { :max_active => 4,
10
+ :max_idle => 2 },
11
+ :loglevel => 2 }
12
+ end
13
+
14
+ end
@@ -0,0 +1,84 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ # http://api.rubyonrails.org/classes/ActiveRecord/Migration.html
18
+ # http://api.rubyonrails.org/classes/ActiveRecord/ConnectionAdapters/TableDefinition.html
19
+
20
+ # Base Urls table schema
21
+ class BaseUrls < ActiveRecord::Migration # Creates the 'urls' table with its base columns, keyed by uhash
22
+ def self.up
23
+ create_table( 'urls', :id => false ) {} # empty table without the automatic 'id' column; columns are added one by one below
24
+
25
+ add_column( 'urls', 'uhash', :text, :null => false )
26
+ # 23 byte ASCII PRIMARY KEY SHA-1 hash fragment of URL
27
+ # (Note :limit not useful.)
28
+
29
+ add_column( 'urls', 'url', :text, :null => false )
30
+ # Complete normalized url (exactly as used for uhash)
31
+
32
+ add_column( 'urls', 'host', :text, :null => false )
33
+ # Normalized host portion of URL
34
+
35
+ add_column( 'urls', 'type', :text, :null => false )
36
+ # FEED, PAGE, ROBOTS, SITEMAP
37
+ # Potentially speculative (i.e. "PAGE" before visited)
38
+ # FIXME: Or REDIRECT here instead of status?
39
+
40
+ add_column( 'urls', 'etag', :text )
41
+ # HTTP ETag header used for subsequent conditional GET
42
+ # Should only be on 200 and related HTTP status, not redirect
43
+
44
+ add_column( 'urls', 'last_visit', 'timestamp with time zone' )
45
+ # Time of last visit (and thus last type,status,reason,etc.)
46
+
47
+ add_column( 'urls', 'status', :integer )
48
+ # HTTP status code or special (negative) status mapping
49
+ # null : Not yet visited
50
+ # -1 : Connection Failed
51
+ # 4xx : Permanent Failures
52
+ # 5xx : Transient server error
53
+ # 200 : Success
54
+ # 304 : Not Modified
55
+ # 301,302 : Redirect
56
+ # http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
57
+ # Compare to: http://crawler.archive.org/articles/user_manual/glossary.html#statuscodes
58
+
59
+ add_column( 'urls', 'pass', :boolean )
60
+ # null : Not yet processed (i.e. visit failed)
61
+ # false : Rejected by processing (for reason), DELETE required
62
+ # true : Fully Processed
63
+
64
+ add_column( 'urls', 'reason', :text )
65
+ # null : None
66
+ # DUPE : Duplicate of referent
67
+ # rejection filter (intended as key)
68
+
69
+ add_column( 'urls', 'referent', :text )
70
+ # null : None
71
+ # uhash of url this is referring to
72
+ # (includes status:REDIRECT, reason:DUPE, etc.)
73
+
74
+ add_column( 'urls', 'referer', :text )
75
+ # null : None
76
+ # uhash of url this was referred from. (i.e. the feed URL)
77
+
78
+ execute( "ALTER TABLE urls ADD PRIMARY KEY (uhash)" ) # raw SQL: make uhash the table's primary key
79
+ end
80
+
81
+ def self.down
82
+ drop_table 'urls'
83
+ end
84
+ end
@@ -0,0 +1,37 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ class AddFeedMetadata < ActiveRecord::Migration # Adds title, ref_pub_date and pub_date columns to 'urls'
18
+
19
+ def self.up
20
+ add_column( 'urls', 'title', :text )
21
+ # PAGE,FEED title
22
+
23
+ add_column( 'urls', 'ref_pub_date', 'timestamp with time zone' )
24
+ # (Latest) published date as provided from feed (may be ahead of
25
+ # or set before pub_date, below).
26
+
27
+ add_column( 'urls', 'pub_date', 'timestamp with time zone' )
28
+ # (Latest) published date as processed
29
+ end
30
+
31
+ def self.down
32
+ remove_column( 'urls', 'title' )
33
+ remove_column( 'urls', 'ref_pub_date' )
34
+ remove_column( 'urls', 'pub_date' )
35
+ end
36
+
37
+ end
@@ -0,0 +1,29 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ class MoreFeedText < ActiveRecord::Migration # Adds nullable text columns 'summary' and 'content' to 'urls'
18
+
19
+ def self.up
20
+ add_column( 'urls', 'summary', :text )
21
+ add_column( 'urls', 'content', :text )
22
+ end
23
+
24
+ def self.down
25
+ remove_column( 'urls', 'summary' )
26
+ remove_column( 'urls', 'content' )
27
+ end
28
+
29
+ end
@@ -0,0 +1,28 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ class AddPriority < ActiveRecord::Migration # Adds the non-null 'priority' column (type real, default 0.0) to 'urls'
18
+
19
+ def self.up
20
+ add_column( 'urls', 'priority', 'real', :null => false, :default => 0.0 )
21
+ # Prioritization of next visit, range -INF,+INF
22
+ end
23
+
24
+ def self.down
25
+ remove_column( 'urls', 'priority' )
26
+ end
27
+
28
+ end
@@ -0,0 +1,30 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ class AddVisitAfter < ActiveRecord::Migration # Adds 'next_visit_after' (timestamp with time zone, default now()) to 'urls'
18
+
19
+ def self.up
20
+ add_column( 'urls', 'next_visit_after', 'timestamp with time zone' )
21
+ execute 'ALTER TABLE urls ALTER COLUMN next_visit_after SET DEFAULT now()' # default is set via raw SQL, after the column is added
22
+ # null: never visit (terminal result)
23
+ # Don't visit again before the specified date.
24
+ end
25
+
26
+ def self.down
27
+ remove_column( 'urls', 'next_visit_after' )
28
+ end
29
+
30
+ end
@@ -0,0 +1,32 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ class AddCacheLocation < ActiveRecord::Migration # Adds integer columns 'cache_file' and 'cache_file_offset' to 'urls'
18
+
19
+ def self.up
20
+ add_column( 'urls', 'cache_file', :integer, :limit => 4 )
21
+ # 32-bit file number
22
+
23
+ add_column( 'urls', 'cache_file_offset', :integer, :limit => 8 )
24
+ # 64-bit byte offset within file
25
+ end
26
+
27
+ def self.down
28
+ remove_column 'urls', 'cache_file'
29
+ remove_column 'urls', 'cache_file_offset'
30
+ end
31
+
32
+ end
@@ -0,0 +1,41 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ # http://api.rubyonrails.org/classes/ActiveRecord/Migration.html
18
+ # http://api.rubyonrails.org/classes/ActiveRecord/ConnectionAdapters/TableDefinition.html
19
+
20
+ # Indexes for urls table
21
+ class UrlIndexes < ActiveRecord::Migration # Currently a no-op: every add_index/remove_index call below is commented out
22
+ def self.up
23
+ # FIXME: Disabled for now. Which are helpful?
24
+
25
+ # add_index( 'urls', [ 'host' ] )
26
+ # Used by (obsolete) LIMIT/sub-query based work poll
27
+
28
+ # add_index( 'urls', [ 'priority' ] )
29
+ # FIXME: Consider partial index, e.g. WHERE next_visit_after IS NOT NULL?
30
+ # FIXME: Consider a combined index 'host', 'priority'?
31
+
32
+ # add_index( 'urls', [ 'next_visit_after' ] )
33
+ # Used by (obsolete) LIMIT/sub-query based work poll
34
+ end
35
+
36
+ def self.down
37
+ # remove_index( 'urls', 'host' )
38
+ # remove_index( 'urls', 'priority' )
39
+ # remove_index( 'urls', 'next_visit_after' )
40
+ end
41
+ end
@@ -0,0 +1,28 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ class AddCreatedAt < ActiveRecord::Migration # Adds 'created_at' (timestamp with time zone, default now()) to 'urls'
18
+
19
+ def self.up
20
+ add_column( 'urls', 'created_at', 'timestamp with time zone' )
21
+ execute 'ALTER TABLE urls ALTER COLUMN created_at SET DEFAULT now()' # default is set via raw SQL, after the column is added
22
+ end
23
+
24
+ def self.down
25
+ remove_column( 'urls', 'created_at' )
26
+ end
27
+
28
+ end
@@ -0,0 +1,33 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ class AddSimhash < ActiveRecord::Migration # Adds the 8-byte integer 'simhash' column to 'urls' and indexes it
18
+
19
+ def self.up
20
+ add_column( 'urls', 'simhash', :integer, :limit => 8 )
21
+ # A simhash signature as a signed 8-byte long (should be
22
+ # compatible with java long).
23
+
24
+ add_index( 'urls', [ 'simhash' ] )
25
+ # And its index
26
+ end
27
+
28
+ def self.down
29
+ remove_index( 'urls', 'simhash' ) # drop the index before the column it covers
30
+ remove_column( 'urls', 'simhash' )
31
+ end
32
+
33
+ end
data/lib/iudex-da.rb ADDED
@@ -0,0 +1,40 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-core'
18
+
19
+ require 'rjack-commons-dbcp'
20
+ require 'rjack-commons-dbutils'
21
+
22
+ require 'iudex-da/base'
23
+ require 'iudex-da/config'
24
+
25
+ require 'java'
26
+
27
+ module Iudex
28
+ module DA
29
+
30
+ require "#{LIB_DIR}/iudex-da-#{VERSION}.jar" # load the gem's jar; LIB_DIR and VERSION are not defined here (presumably from iudex-da/base — confirm)
31
+
32
+ import 'iudex.da.WorkPoller' # Java classes imported into the Iudex::DA namespace
33
+ import 'iudex.da.ContentMapper'
34
+
35
+ module Filters
36
+ import 'iudex.da.filters.UpdateFilter' # imported under Iudex::DA::Filters
37
+ end
38
+
39
+ end
40
+ end
@@ -0,0 +1,48 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'rjack-slf4j'
18
+ require 'iudex-da/config'
19
+ require 'active_record'
20
+ require 'hooker'
21
+
22
+ module Iudex::DA
23
+
24
+ def self.setup # Point ActiveRecord at the SLF4J logger and connect using the merged connect_props
25
+ log = RJack::SLF4J[ "iudex.da.ActiveRecord" ]
26
+ conf = Hooker.merge( [ :iudex, :connect_props ], CONFIG ) # NOTE(review): conf is logged via inspect on the next statement; a configured password would appear in the log — confirm
27
+ log.info { "Connecting: #{ conf.inspect }" }
28
+
29
+ ActiveRecord::Base.logger = log
30
+ ActiveRecord::Base.establish_connection( conf )
31
+ end
32
+
33
+ setup # Runs when this file is loaded. FIXME: Require explicit setup for use?
34
+
35
+ def migrate( target_version = nil ) # Run migrations found in LIB_DIR/../../db; nil target presumably means latest — confirm against ActiveRecord::Migrator
36
+ ActiveRecord::Migrator.migrate( File.join( LIB_DIR, '..', '..', 'db' ),
37
+ target_version )
38
+ #FIXME: Support additional migration directories?
39
+ end
40
+
41
+ module_function :migrate
42
+
43
+ class Url < ActiveRecord::Base # ActiveRecord model keyed by uhash
44
+ set_primary_key :uhash
45
+ set_inheritance_column :object_type # since "type" used already
46
+ end
47
+
48
+ end