iudex-da 1.2.1-java → 1.3.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.rdoc +30 -0
- data/Manifest.txt +10 -16
- data/bin/iudex-migrate +7 -3
- data/db/20111012173757_base.rb +117 -0
- data/db/{0070_add_created_at.rb → 20120930173600_uhash_collation_order.rb} +16 -10
- data/db/{simhash/0085_add_simhash_index.rb → index_next_visit/21500000000101_add_index_next_visit.rb} +5 -9
- data/db/{0081_remove_simhash_index.rb → simhash/21500000000001_add_simhash_index.rb} +5 -9
- data/lib/iudex-da.rb +3 -1
- data/lib/iudex-da/base.rb +1 -1
- data/lib/iudex-da/config.rb +3 -3
- data/lib/iudex-da/iudex-da-1.3.0.jar +0 -0
- data/lib/iudex-da/models.rb +66 -0
- data/lib/iudex-da/orm.rb +183 -0
- data/lib/iudex-da/work_poller.rb +307 -0
- data/pom.xml +2 -2
- data/test/setup.rb +7 -5
- data/test/test_migrate.rb +8 -22
- data/test/test_pool_factory.rb +24 -13
- data/test/test_url_model.rb +52 -0
- data/test/test_work_poller.rb +157 -0
- metadata +210 -185
- data/db/0010_base_urls.rb +0 -84
- data/db/0020_add_feed_metadata.rb +0 -37
- data/db/0021_more_feed_text.rb +0 -29
- data/db/0030_add_priority.rb +0 -28
- data/db/0040_add_visit_after.rb +0 -30
- data/db/0050_add_cache_location.rb +0 -32
- data/db/0060_url_indexes.rb +0 -41
- data/db/0080_add_simhash.rb +0 -33
- data/db/0110_host_to_domain.rb +0 -36
- data/db/index_next_visit/0100_add_index_next_visit.rb +0 -27
- data/lib/iudex-da/ar.rb +0 -66
- data/lib/iudex-da/iudex-da-1.2.1.jar +0 -0
- data/test/test_poll_work.rb +0 -132
Binary file
|
data/test/test_poll_work.rb
DELETED
@@ -1,132 +0,0 @@
|
|
1
|
-
#!/usr/bin/env jruby
|
2
|
-
#.hashdot.profile += jruby-shortlived
|
3
|
-
|
4
|
-
#--
|
5
|
-
# Copyright (c) 2008-2012 David Kellum
|
6
|
-
#
|
7
|
-
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
-
# may not use this file except in compliance with the License. You
|
9
|
-
# may obtain a copy of the License at
|
10
|
-
#
|
11
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
-
#
|
13
|
-
# Unless required by applicable law or agreed to in writing, software
|
14
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
-
# implied. See the License for the specific language governing
|
17
|
-
# permissions and limitations under the License.
|
18
|
-
#++
|
19
|
-
|
20
|
-
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
-
|
22
|
-
require 'iudex-core'
|
23
|
-
require 'iudex-da/ar'
|
24
|
-
|
25
|
-
class TestPollWork < MiniTest::Unit::TestCase
|
26
|
-
include Iudex::DA
|
27
|
-
import 'iudex.core.VisitURL'
|
28
|
-
|
29
|
-
def setup
|
30
|
-
Url.delete_all
|
31
|
-
|
32
|
-
domains = [ 'foo.org', 'other.net', 'gravitext.com', 'one.at' ]
|
33
|
-
count = 0
|
34
|
-
domains.each do |domain|
|
35
|
-
(5..15).each do |val|
|
36
|
-
url = Url.create! do |u|
|
37
|
-
u.priority = ( val.to_f / 10.0 ) + (count.to_f / 50.0)
|
38
|
-
vurl = VisitURL.normalize( "http://#{domain}/#{u.priority}" )
|
39
|
-
u.type = "FEED"
|
40
|
-
u.domain = vurl.domain
|
41
|
-
u.url = vurl.to_s
|
42
|
-
u.uhash = vurl.uhash
|
43
|
-
u.next_visit_after = Time.now
|
44
|
-
count += 1
|
45
|
-
end
|
46
|
-
end
|
47
|
-
end
|
48
|
-
end
|
49
|
-
|
50
|
-
def teardown
|
51
|
-
Url.delete_all
|
52
|
-
end
|
53
|
-
|
54
|
-
# Query to get new work, with limits on work per domain, and total
|
55
|
-
# work (in descending priority order)
|
56
|
-
def test_poll
|
57
|
-
query = <<END
|
58
|
-
SELECT url, domain, type, priority
|
59
|
-
FROM ( SELECT *, row_number() OVER ( ORDER BY priority DESC ) as ppos
|
60
|
-
FROM ( SELECT *, row_number() OVER ( PARTITION BY domain
|
61
|
-
ORDER BY priority DESC ) AS hpos
|
62
|
-
FROM urls
|
63
|
-
WHERE next_visit_after <= now() ) AS subh
|
64
|
-
WHERE hpos <= ? ) AS subp
|
65
|
-
WHERE ppos <= ?
|
66
|
-
ORDER BY domain, priority DESC;
|
67
|
-
END
|
68
|
-
res = Url.find_by_sql( [ query, 5, 18 ] )
|
69
|
-
|
70
|
-
def check_domain_subset( bydomain )
|
71
|
-
assert( bydomain.length <= 5 )
|
72
|
-
bydomain.each_cons(2) { |p,n| assert( p.priority >= n.priority ) }
|
73
|
-
end
|
74
|
-
|
75
|
-
assert( res.length <= 18 )
|
76
|
-
bydomain = []
|
77
|
-
res.each do |u|
|
78
|
-
if bydomain.empty? || bydomain.last.domain == u.domain
|
79
|
-
bydomain << u
|
80
|
-
else
|
81
|
-
check_domain_subset( bydomain )
|
82
|
-
bydomain = []
|
83
|
-
end
|
84
|
-
end
|
85
|
-
check_domain_subset( bydomain ) unless bydomain.empty?
|
86
|
-
|
87
|
-
end
|
88
|
-
|
89
|
-
def test_insert
|
90
|
-
|
91
|
-
Url.transaction do
|
92
|
-
sql = <<END
|
93
|
-
CREATE TEMPORARY TABLE mod_urls
|
94
|
-
( uhash text,
|
95
|
-
url text,
|
96
|
-
domain text );
|
97
|
-
END
|
98
|
-
# ON COMMIT DROP;
|
99
|
-
|
100
|
-
Url.connection.execute( sql ) #FIXME: auto-commit mode?
|
101
|
-
|
102
|
-
# Url.set_table_name "mod_urls"
|
103
|
-
|
104
|
-
count = ( 11 * 2 )
|
105
|
-
(5..20).each do |val|
|
106
|
-
# url = Url.create! do |u|
|
107
|
-
priority = ( val.to_f / 10.0 ) + (count.to_f / 50.0)
|
108
|
-
# u.priority =
|
109
|
-
# u.type = "FEEDX"
|
110
|
-
vurl = VisitURL.normalize( "http://gravitext.com/#{priority}" )
|
111
|
-
|
112
|
-
sql = "INSERT into mod_urls VALUES ('%s','%s','%s')" %
|
113
|
-
[ vurl.uhash, vurl.to_s, vurl.domain ]
|
114
|
-
Url.connection.execute( sql )
|
115
|
-
# u.next_visit_after = Time.now
|
116
|
-
count += 1
|
117
|
-
end
|
118
|
-
insert_query = <<END
|
119
|
-
INSERT INTO urls (uhash,url,domain,type,priority)
|
120
|
-
( SELECT uhash,url,domain,'FEEDX',4.78 FROM mod_urls
|
121
|
-
WHERE uhash NOT IN ( SELECT uhash FROM urls ) );
|
122
|
-
END
|
123
|
-
Url.connection.execute( insert_query )
|
124
|
-
|
125
|
-
Url.connection.execute( "DROP TABLE mod_urls;" )
|
126
|
-
|
127
|
-
# Url.set_table_name "urls"
|
128
|
-
end
|
129
|
-
|
130
|
-
end
|
131
|
-
|
132
|
-
end
|