iudex-da 1.2.1-java → 1.3.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/History.rdoc +30 -0
- data/Manifest.txt +10 -16
- data/bin/iudex-migrate +7 -3
- data/db/20111012173757_base.rb +117 -0
- data/db/{0070_add_created_at.rb → 20120930173600_uhash_collation_order.rb} +16 -10
- data/db/{simhash/0085_add_simhash_index.rb → index_next_visit/21500000000101_add_index_next_visit.rb} +5 -9
- data/db/{0081_remove_simhash_index.rb → simhash/21500000000001_add_simhash_index.rb} +5 -9
- data/lib/iudex-da.rb +3 -1
- data/lib/iudex-da/base.rb +1 -1
- data/lib/iudex-da/config.rb +3 -3
- data/lib/iudex-da/iudex-da-1.3.0.jar +0 -0
- data/lib/iudex-da/models.rb +66 -0
- data/lib/iudex-da/orm.rb +183 -0
- data/lib/iudex-da/work_poller.rb +307 -0
- data/pom.xml +2 -2
- data/test/setup.rb +7 -5
- data/test/test_migrate.rb +8 -22
- data/test/test_pool_factory.rb +24 -13
- data/test/test_url_model.rb +52 -0
- data/test/test_work_poller.rb +157 -0
- metadata +210 -185
- data/db/0010_base_urls.rb +0 -84
- data/db/0020_add_feed_metadata.rb +0 -37
- data/db/0021_more_feed_text.rb +0 -29
- data/db/0030_add_priority.rb +0 -28
- data/db/0040_add_visit_after.rb +0 -30
- data/db/0050_add_cache_location.rb +0 -32
- data/db/0060_url_indexes.rb +0 -41
- data/db/0080_add_simhash.rb +0 -33
- data/db/0110_host_to_domain.rb +0 -36
- data/db/index_next_visit/0100_add_index_next_visit.rb +0 -27
- data/lib/iudex-da/ar.rb +0 -66
- data/lib/iudex-da/iudex-da-1.2.1.jar +0 -0
- data/test/test_poll_work.rb +0 -132
Binary file
|
data/test/test_poll_work.rb
DELETED
@@ -1,132 +0,0 @@
|
|
1
|
-
#!/usr/bin/env jruby
|
2
|
-
#.hashdot.profile += jruby-shortlived
|
3
|
-
|
4
|
-
#--
|
5
|
-
# Copyright (c) 2008-2012 David Kellum
|
6
|
-
#
|
7
|
-
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
-
# may not use this file except in compliance with the License. You
|
9
|
-
# may obtain a copy of the License at
|
10
|
-
#
|
11
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
-
#
|
13
|
-
# Unless required by applicable law or agreed to in writing, software
|
14
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
-
# implied. See the License for the specific language governing
|
17
|
-
# permissions and limitations under the License.
|
18
|
-
#++
|
19
|
-
|
20
|
-
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
-
|
22
|
-
require 'iudex-core'
|
23
|
-
require 'iudex-da/ar'
|
24
|
-
|
25
|
-
# Exercises, against the urls table via the Url model, the SQL used to
# poll for pending work and to bulk-insert URLs that are not yet known.
class TestPollWork < MiniTest::Unit::TestCase
  include Iudex::DA
  import 'iudex.core.VisitURL'

  # Limits bound into the poll query and asserted on its results.
  MAX_URLS_PER_DOMAIN = 5
  MAX_URLS_TOTAL      = 18

  # Empties the urls table, then creates 11 FEED rows (val 5..15) for
  # each of four domains. Priority grows with both val and the running
  # row count, and every row is immediately due (next_visit_after = now).
  def setup
    Url.delete_all

    domains = [ 'foo.org', 'other.net', 'gravitext.com', 'one.at' ]
    count = 0
    domains.each do |domain|
      (5..15).each do |val|
        Url.create! do |u|
          u.priority = ( val.to_f / 10.0 ) + ( count.to_f / 50.0 )
          # The priority is embedded in the path so that each URL differs.
          vurl = VisitURL.normalize( "http://#{domain}/#{u.priority}" )
          u.type   = "FEED"
          u.domain = vurl.domain
          u.url    = vurl.to_s
          u.uhash  = vurl.uhash
          u.next_visit_after = Time.now
          count += 1
        end
      end
    end
  end

  # Leaves the urls table empty for whatever runs next.
  def teardown
    Url.delete_all
  end

  # Query to get new work, with limits on work per domain, and total
  # work (in descending priority order)
  def test_poll
    # Inner window: rank rows within each domain (hpos). Outer window:
    # rank the per-domain survivors overall (ppos).
    query = <<END
SELECT url, domain, type, priority
FROM ( SELECT *, row_number() OVER ( ORDER BY priority DESC ) as ppos
       FROM ( SELECT *, row_number() OVER ( PARTITION BY domain
                                            ORDER BY priority DESC ) AS hpos
              FROM urls
              WHERE next_visit_after <= now() ) AS subh
       WHERE hpos <= ? ) AS subp
WHERE ppos <= ?
ORDER BY domain, priority DESC;
END
    res = Url.find_by_sql( [ query, MAX_URLS_PER_DOMAIN, MAX_URLS_TOTAL ] )

    assert( res.length <= MAX_URLS_TOTAL )

    # Group by domain so that every returned row is validated. group_by
    # keeps rows in result order, so the priority ordering check in
    # check_domain_subset still sees them as the query returned them.
    res.group_by { |u| u.domain }.each_value do |bydomain|
      check_domain_subset( bydomain )
    end
  end

  # Asserts that one domain's rows respect the per-domain limit and are
  # in non-increasing priority order.
  #
  # bydomain:: Array of Url rows that all share the same domain.
  def check_domain_subset( bydomain )
    assert( bydomain.length <= MAX_URLS_PER_DOMAIN )
    bydomain.each_cons(2) { |p,n| assert( p.priority >= n.priority ) }
  end

  # Stages candidate URLs in a temporary table, then copies into urls only
  # those whose uhash is not already present. There are no explicit
  # assertions: the test passes when no statement raises.
  def test_insert

    Url.transaction do
      create_sql = <<END
CREATE TEMPORARY TABLE mod_urls
  ( uhash text,
    url text,
    domain text );
END
      # ON COMMIT DROP; (not used: the table is dropped explicitly below)

      Url.connection.execute( create_sql ) #FIXME: auto-commit mode?

      # NOTE(review): 22 is the row count setup has reached when it starts
      # on 'gravitext.com', and the priority formula is the same, so the
      # first staged URLs presumably coincide with existing rows and
      # exercise the NOT IN filter below -- confirm.
      count = ( 11 * 2 )
      (5..20).each do |val|
        priority = ( val.to_f / 10.0 ) + ( count.to_f / 50.0 )
        vurl = VisitURL.normalize( "http://gravitext.com/#{priority}" )

        # Values come from the normalized test URL built above, not from
        # external input.
        row_sql = "INSERT into mod_urls VALUES ('%s','%s','%s')" %
          [ vurl.uhash, vurl.to_s, vurl.domain ]
        Url.connection.execute( row_sql )
        count += 1
      end

      insert_query = <<END
INSERT INTO urls (uhash,url,domain,type,priority)
  ( SELECT uhash,url,domain,'FEEDX',4.78 FROM mod_urls
    WHERE uhash NOT IN ( SELECT uhash FROM urls ) );
END
      Url.connection.execute( insert_query )

      Url.connection.execute( "DROP TABLE mod_urls;" )
    end

  end

end
|