iudex-da 1.2.1-java → 1.3.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/db/0010_base_urls.rb DELETED
@@ -1,84 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- # http://api.rubyonrails.org/classes/ActiveRecord/Migration.html
18
- # http://api.rubyonrails.org/classes/ActiveRecord/ConnectionAdapters/TableDefinition.html
19
-
20
- # Base Urls table schema
21
- class BaseUrls < ActiveRecord::Migration
22
- def self.up
23
- create_table( 'urls', :id => false ) {}
24
-
25
- add_column( 'urls', 'uhash', :text, :null => false )
26
- # 23 byte ASCII PRIMARY KEY SHA-1 hash fragment of URL
27
- # (Note :limit not useful.)
28
-
29
- add_column( 'urls', 'url', :text, :null => false )
30
- # Complete normalized url (exactly as used for uhash)
31
-
32
- add_column( 'urls', 'host', :text, :null => false )
33
- # Normalized host portion of URL
34
-
35
- add_column( 'urls', 'type', :text, :null => false )
36
- # FEED, PAGE, ROBOTS, SITEMAP
37
- # Potentially speculative (i.e. "PAGE" before visited)
38
- # FIXME: Or REDIRECT here instead of status?
39
-
40
- add_column( 'urls', 'etag', :text )
41
- # HTTP ETag header used for subsequent conditional GET
42
- # Should only be on 200 and related HTTP status, not redirect
43
-
44
- add_column( 'urls', 'last_visit', 'timestamp with time zone' )
45
- # Time of last visit (and thus last type,status,reason,etc.)
46
-
47
- add_column( 'urls', 'status', :integer )
48
- # HTTP status code or special (negative) status mapping
49
- # null : Not yet visited
50
- # -1 : Connection Failed
51
- # 4xx : Permanent Failures
52
- # 5xx : Transient server error
53
- # 200 : Success
54
- # 304 : Not Modified
55
- # 301,302 : Redirect
56
- # http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
57
- # Compare to: http://crawler.archive.org/articles/user_manual/glossary.html#statuscodes
58
-
59
- add_column( 'urls', 'pass', :boolean )
60
- # null : Not yet processed (i.e. visit failed)
61
- # false : Rejected by processing (for reason), DELETE required
62
- # true : Fully Processed
63
-
64
- add_column( 'urls', 'reason', :text )
65
- # null : None
66
- # DUPE : Duplicate of referent
67
- # rejection filter (intended as key)
68
-
69
- add_column( 'urls', 'referent', :text )
70
- # null : None
71
- # uhash of url this is referring to
72
- # (includes status:REDIRECT, reason:DUPE, etc.)
73
-
74
- add_column( 'urls', 'referer', :text )
75
- # null : None
76
- # uhash of url this was referred from. (i.e. the feed URL)
77
-
78
- execute( "ALTER TABLE urls ADD PRIMARY KEY (uhash)" )
79
- end
80
-
81
- def self.down
82
- drop_table 'urls'
83
- end
84
- end
@@ -1,37 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- class AddFeedMetadata < ActiveRecord::Migration
18
-
19
- def self.up
20
- add_column( 'urls', 'title', :text )
21
- # PAGE,FEED title
22
-
23
- add_column( 'urls', 'ref_pub_date', 'timestamp with time zone' )
24
- # (Latest) published date as provided from feed (may be ahead of
25
- # or set before pub_date, below).
26
-
27
- add_column( 'urls', 'pub_date', 'timestamp with time zone' )
28
- # (Latest) published date as processed
29
- end
30
-
31
- def self.down
32
- remove_column( 'urls', 'title' )
33
- remove_column( 'urls', 'ref_pub_date' )
34
- remove_column( 'urls', 'pub_date' )
35
- end
36
-
37
- end
@@ -1,29 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- class MoreFeedText < ActiveRecord::Migration
18
-
19
- def self.up
20
- add_column( 'urls', 'summary', :text )
21
- add_column( 'urls', 'content', :text )
22
- end
23
-
24
- def self.down
25
- remove_column( 'urls', 'summary' )
26
- remove_column( 'urls', 'content' )
27
- end
28
-
29
- end
@@ -1,28 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- class AddPriority < ActiveRecord::Migration
18
-
19
- def self.up
20
- add_column( 'urls', 'priority', 'real', :null => false, :default => 0.0 )
21
- # Prioritization of next visit, range -INF,+INF
22
- end
23
-
24
- def self.down
25
- remove_column( 'urls', 'priority' )
26
- end
27
-
28
- end
@@ -1,30 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- class AddVisitAfter < ActiveRecord::Migration
18
-
19
- def self.up
20
- add_column( 'urls', 'next_visit_after', 'timestamp with time zone' )
21
- execute 'ALTER TABLE urls ALTER COLUMN next_visit_after SET DEFAULT now()'
22
- # null: never visit (terminal result)
23
- # Don't visit again before the specified date.
24
- end
25
-
26
- def self.down
27
- remove_column( 'urls', 'next_visit_after' )
28
- end
29
-
30
- end
@@ -1,32 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- class AddCacheLocation < ActiveRecord::Migration
18
-
19
- def self.up
20
- add_column( 'urls', 'cache_file', :integer, :limit => 4 )
21
- # 32-bit file number
22
-
23
- add_column( 'urls', 'cache_file_offset', :integer, :limit => 8 )
24
- # 64-bit byte offset within file
25
- end
26
-
27
- def self.down
28
- remove_column 'urls', 'cache_file'
29
- remove_column 'urls', 'cache_file_offset'
30
- end
31
-
32
- end
@@ -1,41 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- # http://api.rubyonrails.org/classes/ActiveRecord/Migration.html
18
- # http://api.rubyonrails.org/classes/ActiveRecord/ConnectionAdapters/TableDefinition.html
19
-
20
- # Indexes for urls table
21
- class UrlIndexes < ActiveRecord::Migration
22
- def self.up
23
- # FIXME: Disabled for now. Which are helpful?
24
-
25
- # add_index( 'urls', [ 'host' ] )
26
- # Used by (obsolesced) LIMIT/sub-query based work poll
27
-
28
- # add_index( 'urls', [ 'priority' ] )
29
- # FIXME: Consider partial index, e.g. WHERE next_visit_after IS NOT NULL?
30
- # FIXME: Consider a combined index 'host', 'priority'?
31
-
32
- # add_index( 'urls', [ 'next_visit_after' ] )
33
- # Used by (obsolesced) LIMIT/sub-query based work poll
34
- end
35
-
36
- def self.down
37
- # remove_index( 'urls', 'host' )
38
- # remove_index( 'urls', 'priority' )
39
- # remove_index( 'urls', 'next_visit_after' )
40
- end
41
- end
@@ -1,33 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- class AddSimhash < ActiveRecord::Migration
18
-
19
- def self.up
20
- add_column( 'urls', 'simhash', :integer, :limit => 8 )
21
- # A simhash signature as a signed 8-byte long (should be
22
- # compatible with java long).
23
-
24
- add_index( 'urls', [ 'simhash' ] )
25
- # And its index
26
- end
27
-
28
- def self.down
29
- remove_index( 'urls', 'simhash' )
30
- remove_column( 'urls', 'simhash' )
31
- end
32
-
33
- end
@@ -1,36 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- class HostToDomain < ActiveRecord::Migration
18
-
19
- def self.up
20
- # Move host to domain in place. This is intended to be the
21
- # normalized registration level domain as provided by
22
- # VisitURL.domain. Existing host values will often not match the
23
- # RL domain, but the usage as a WorkPoller grouping does not
24
- # strictly require this. Furthermore it would be very costly to
25
- # rewrite a large database to correct domain values. Start with a
26
- # clean or custom migrated database if this consistency is
27
- # important for your needs.
28
- rename_column( 'urls', 'host', 'domain' )
29
- #Equiv: add_column( 'urls', 'domain', :text, :null => false )
30
- end
31
-
32
- def self.down
33
- rename_column( 'urls', 'domain', 'host' )
34
- end
35
-
36
- end
@@ -1,27 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- class AddIndexNextVisit < ActiveRecord::Migration
18
-
19
- def self.up
20
- add_index( 'urls', 'next_visit_after' )
21
- end
22
-
23
- def self.down
24
- remove_index( 'urls', 'next_visit_after' )
25
- end
26
-
27
- end
data/lib/iudex-da/ar.rb DELETED
@@ -1,66 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- require 'rjack-slf4j'
18
- require 'iudex-da/config'
19
- require 'active_record'
20
- require 'hooker'
21
-
22
- module Iudex::DA
23
-
24
- def self.setup
25
- log = RJack::SLF4J[ "iudex.da.ActiveRecord" ]
26
- conf = Hooker.merge( [ :iudex, :connect_props ], CONFIG )
27
- log.info { "Connecting: #{ conf.inspect }" }
28
-
29
- ActiveRecord::Base.logger = log
30
- ActiveRecord::Base.establish_connection( conf )
31
- end
32
-
33
- setup #FIXME: Require explicit setup for use?
34
-
35
- def migrate( target_version = nil )
36
- base = File.join( LIB_DIR, '..', '..', 'db' )
37
- paths = [ base ]
38
-
39
- profiles = Hooker.apply( [ :iudex, :migration_profiles ], [] )
40
-
41
- paths += profiles.compact.map do |p|
42
- p = p.to_s
43
- if p =~ %r{^/}
44
- p
45
- else
46
- File.join( base, p )
47
- end
48
- end
49
-
50
- pattern = if paths.size > 1
51
- '{' + paths.join( ',' ) + '}'
52
- else
53
- paths.first
54
- end
55
-
56
- ActiveRecord::Migrator.migrate( pattern, target_version )
57
- end
58
-
59
- module_function :migrate
60
-
61
- class Url < ActiveRecord::Base
62
- set_primary_key :uhash
63
- set_inheritance_column :object_type # since "type" used already
64
- end
65
-
66
- end