iudex-da 1.2.1-java → 1.3.0-java

Sign up to get free protection for your applications and to get access to all the features.
data/db/0010_base_urls.rb DELETED
@@ -1,84 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- # http://api.rubyonrails.org/classes/ActiveRecord/Migration.html
18
- # http://api.rubyonrails.org/classes/ActiveRecord/ConnectionAdapters/TableDefinition.html
19
-
20
- # Base Urls table schema
21
- class BaseUrls < ActiveRecord::Migration # Creates the 'urls' table and its initial columns
22
- def self.up # NOTE: each comment group below documents the add_column directly above it
23
- create_table( 'urls', :id => false ) {} # No default id column; empty block, columns are added below
24
-
25
- add_column( 'urls', 'uhash', :text, :null => false )
26
- # 23 byte ASCII PRIMARY KEY SHA-1 hash fragment of URL
27
- # (Note :limit not useful.)
28
-
29
- add_column( 'urls', 'url', :text, :null => false )
30
- # Complete normalized url (exactly as used for uhash)
31
-
32
- add_column( 'urls', 'host', :text, :null => false )
33
- # Normalized host portion of URL
34
-
35
- add_column( 'urls', 'type', :text, :null => false )
36
- # FEED, PAGE, ROBOTS, SITEMAP
37
- # Potentially speculative (e.g. "PAGE" before visited)
38
- # FIXME: Or REDIRECT here instead of status?
39
-
40
- add_column( 'urls', 'etag', :text )
41
- # HTTP ETag header used for subsequent conditional GET
42
- # Should only be on 200 and related HTTP status, not redirect
43
-
44
- add_column( 'urls', 'last_visit', 'timestamp with time zone' )
45
- # Time of last visit (and thus last type,status,reason,etc.)
46
-
47
- add_column( 'urls', 'status', :integer )
48
- # HTTP status code or special (negative) status mapping
49
- # null : Not yet visited
50
- # -1 : Connection Failed
51
- # 4xx : Permanent Failures
52
- # 5xx : Transient server error
53
- # 200 : Success
54
- # 304 : Not Modified
55
- # 301,302 : Redirect
56
- # http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
57
- # Compare to: http://crawler.archive.org/articles/user_manual/glossary.html#statuscodes
58
-
59
- add_column( 'urls', 'pass', :boolean )
60
- # null : Not yet processed (i.e. visit failed)
61
- # false : Rejected by processing (for reason), DELETE required
62
- # true : Fully Processed
63
-
64
- add_column( 'urls', 'reason', :text )
65
- # null : None
66
- # DUPE : Duplicate of referent
67
- # rejection filter (intended as key)
68
-
69
- add_column( 'urls', 'referent', :text )
70
- # null : None
71
- # uhash of url this is referring to
72
- # (includes status:REDIRECT, reason:DUPE, etc.)
73
-
74
- add_column( 'urls', 'referer', :text )
75
- # null : None
76
- # uhash of url this was referred from (i.e. the feed URL)
77
-
78
- execute( "ALTER TABLE urls ADD PRIMARY KEY (uhash)" ) # Table was created with :id => false; the key is declared here via raw SQL
79
- end
80
-
81
- def self.down # Reverts up by dropping the whole 'urls' table
82
- drop_table 'urls'
83
- end
84
- end
@@ -1,37 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- class AddFeedMetadata < ActiveRecord::Migration # Adds title, ref_pub_date and pub_date columns to 'urls'
18
-
19
- def self.up # NOTE: each comment below documents the add_column directly above it
20
- add_column( 'urls', 'title', :text )
21
- # PAGE,FEED title
22
-
23
- add_column( 'urls', 'ref_pub_date', 'timestamp with time zone' )
24
- # (Latest) published date as provided from feed (may be ahead of
25
- # or set before pub_date, below).
26
-
27
- add_column( 'urls', 'pub_date', 'timestamp with time zone' )
28
- # (Latest) published date as processed
29
- end
30
-
31
- def self.down # Removes the three columns added by up
32
- remove_column( 'urls', 'title' )
33
- remove_column( 'urls', 'ref_pub_date' )
34
- remove_column( 'urls', 'pub_date' )
35
- end
36
-
37
- end
@@ -1,29 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- class MoreFeedText < ActiveRecord::Migration # Adds 'summary' and 'content' text columns to 'urls'
18
-
19
- def self.up # Both columns are declared as :text with no further options
20
- add_column( 'urls', 'summary', :text )
21
- add_column( 'urls', 'content', :text )
22
- end
23
-
24
- def self.down # Removes the two columns added by up
25
- remove_column( 'urls', 'summary' )
26
- remove_column( 'urls', 'content' )
27
- end
28
-
29
- end
@@ -1,28 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- class AddPriority < ActiveRecord::Migration # Adds the 'priority' column to 'urls'
18
-
19
- def self.up
20
- add_column( 'urls', 'priority', 'real', :null => false, :default => 0.0 ) # SQL type 'real', non-null, defaults to 0.0
21
- # Prioritization of next visit, range -INF,+INF
22
- end
23
-
24
- def self.down # Removes the column added by up
25
- remove_column( 'urls', 'priority' )
26
- end
27
-
28
- end
@@ -1,30 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- class AddVisitAfter < ActiveRecord::Migration # Adds the 'next_visit_after' timestamp column to 'urls'
18
-
19
- def self.up
20
- add_column( 'urls', 'next_visit_after', 'timestamp with time zone' )
21
- execute 'ALTER TABLE urls ALTER COLUMN next_visit_after SET DEFAULT now()' # Column default is the SQL now() function, set via raw ALTER TABLE
22
- # null: never visit (terminal result)
23
- # Otherwise: don't visit again before the specified date.
24
- end
25
-
26
- def self.down # Removes the column added by up
27
- remove_column( 'urls', 'next_visit_after' )
28
- end
29
-
30
- end
@@ -1,32 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- class AddCacheLocation < ActiveRecord::Migration # Adds cache_file and cache_file_offset integer columns to 'urls'
18
-
19
- def self.up # NOTE: each comment below documents the add_column directly above it
20
- add_column( 'urls', 'cache_file', :integer, :limit => 4 )
21
- # 32-bit file number
22
-
23
- add_column( 'urls', 'cache_file_offset', :integer, :limit => 8 )
24
- # 64-bit byte offset within file
25
- end
26
-
27
- def self.down # Removes the two columns added by up
28
- remove_column 'urls', 'cache_file'
29
- remove_column 'urls', 'cache_file_offset'
30
- end
31
-
32
- end
@@ -1,41 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- # http://api.rubyonrails.org/classes/ActiveRecord/Migration.html
18
- # http://api.rubyonrails.org/classes/ActiveRecord/ConnectionAdapters/TableDefinition.html
19
-
20
- # Indexes for urls table
21
- class UrlIndexes < ActiveRecord::Migration # Placeholder for 'urls' indexes; every statement is commented out
22
- def self.up # Currently a no-op: the body contains only comments
23
- # FIXME: Disabled for now. Which are helpful?
24
-
25
- # add_index( 'urls', [ 'host' ] )
26
- # Used by the (now obsolete) LIMIT/sub-query based work poll
27
-
28
- # add_index( 'urls', [ 'priority' ] )
29
- # FIXME: Consider partial index, e.g. WHERE next_visit_after IS NOT NULL?
30
- # FIXME: Consider a combined index 'host', 'priority'?
31
-
32
- # add_index( 'urls', [ 'next_visit_after' ] )
33
- # Used by the (now obsolete) LIMIT/sub-query based work poll
34
- end
35
-
36
- def self.down # Also a no-op, mirroring the disabled add_index calls in up
37
- # remove_index( 'urls', 'host' )
38
- # remove_index( 'urls', 'priority' )
39
- # remove_index( 'urls', 'next_visit_after' )
40
- end
41
- end
@@ -1,33 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- class AddSimhash < ActiveRecord::Migration # Adds the 'simhash' column to 'urls' together with an index on it
18
-
19
- def self.up # NOTE: each comment below documents the statement directly above it
20
- add_column( 'urls', 'simhash', :integer, :limit => 8 )
21
- # A simhash signature as a signed 8-byte long (should be
22
- # compatible with java long).
23
-
24
- add_index( 'urls', [ 'simhash' ] )
25
- # And its index
26
- end
27
-
28
- def self.down # Reverse order of up: the index is removed before the column
29
- remove_index( 'urls', 'simhash' )
30
- remove_column( 'urls', 'simhash' )
31
- end
32
-
33
- end
@@ -1,36 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- class HostToDomain < ActiveRecord::Migration # Renames the 'host' column of 'urls' to 'domain'
18
-
19
- def self.up
20
- # Move host to domain in place. This is intended to be the
21
- # normalized registration level domain as provided by
22
- # VisitURL.domain. Existing host values will often not match the
23
- # RL domain, but the usage as a WorkPoller grouping does not
24
- # strictly require this. Furthermore it would be very costly to
25
- # rewrite a large database to correct domain values. Start with a
26
- # clean or custom migrated database if this consistency is
27
- # important for your needs.
28
- rename_column( 'urls', 'host', 'domain' ) # Rename only; existing row values are not rewritten
29
- # Resulting column definition is equivalent to: add_column( 'urls', 'domain', :text, :null => false )
30
- end
31
-
32
- def self.down # Renames 'domain' back to 'host'
33
- rename_column( 'urls', 'domain', 'host' )
34
- end
35
-
36
- end
@@ -1,27 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- class AddIndexNextVisit < ActiveRecord::Migration # Adds an index on urls.next_visit_after
18
-
19
- def self.up
20
- add_index( 'urls', 'next_visit_after' ) # Single-column index
21
- end
22
-
23
- def self.down # Removes the index added by up
24
- remove_index( 'urls', 'next_visit_after' )
25
- end
26
-
27
- end
data/lib/iudex-da/ar.rb DELETED
@@ -1,66 +0,0 @@
1
- #--
2
- # Copyright (c) 2008-2012 David Kellum
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License"); you
5
- # may not use this file except in compliance with the License. You
6
- # may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
- # implied. See the License for the specific language governing
14
- # permissions and limitations under the License.
15
- #++
16
-
17
- require 'rjack-slf4j'
18
- require 'iudex-da/config'
19
- require 'active_record'
20
- require 'hooker'
21
-
22
- module Iudex::DA # ActiveRecord connection setup, migration runner and the Url model
23
-
24
- def self.setup # Connects ActiveRecord; NOTE(review): conf.inspect is logged below and may include credentials - confirm
25
- log = RJack::SLF4J[ "iudex.da.ActiveRecord" ]
26
- conf = Hooker.merge( [ :iudex, :connect_props ], CONFIG ) # presumably CONFIG merged with hook-supplied connect_props - confirm against Hooker docs
27
- log.info { "Connecting: #{ conf.inspect }" }
28
-
29
- ActiveRecord::Base.logger = log # ActiveRecord logs through the same SLF4J logger
30
- ActiveRecord::Base.establish_connection( conf )
31
- end
32
-
33
- setup #FIXME: Require explicit setup for use? (as written, the connection is set up when this file is loaded)
34
-
35
- def migrate( target_version = nil ) # Runs migrations from the db directory plus any profile directories; target_version is passed through to the Migrator
36
- base = File.join( LIB_DIR, '..', '..', 'db' ) # db directory two levels above LIB_DIR
37
- paths = [ base ]
38
-
39
- profiles = Hooker.apply( [ :iudex, :migration_profiles ], [] ) # presumably the hook-supplied profile list, starting from [] - confirm against Hooker docs
40
-
41
- paths += profiles.compact.map do |p| # nil entries are dropped; each remaining profile becomes a directory path
42
- p = p.to_s
43
- if p =~ %r{^/} # Leading '/': used as given. NOTE(review): ^ anchors at any line start; \A would anchor at string start
44
- p
45
- else
46
- File.join( base, p ) # Otherwise resolved relative to base
47
- end
48
- end
49
-
50
- pattern = if paths.size > 1 # Several directories are combined into one '{a,b}' brace expression
51
- '{' + paths.join( ',' ) + '}'
52
- else
53
- paths.first
54
- end
55
-
56
- ActiveRecord::Migrator.migrate( pattern, target_version ) # presumably the Migrator expands the brace expression as a glob - TODO confirm
57
- end
58
-
59
- module_function :migrate # Makes migrate callable as Iudex::DA.migrate
60
-
61
- class Url < ActiveRecord::Base # Model class; table name is not set here, presumably the conventional 'urls' - confirm
62
- set_primary_key :uhash
63
- set_inheritance_column :object_type # since "type" used already
64
- end
65
-
66
- end