sunflower 0.3 → 0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +1 -1
- data/lib/sunflower/commontasks.rb +265 -250
- data/lib/sunflower/core.rb +288 -287
- data/lib/sunflower/listmaker.rb +160 -152
- data/scripts/ZDBOT.rb +61 -61
- data/scripts/aktualizacjapilkarzy.rb +339 -339
- data/scripts/changeimage.rb +41 -41
- data/scripts/fix-bold-in-headers.rb +41 -53
- data/scripts/fix-double-pipes.rb +30 -49
- data/scripts/fix-langs.rb +42 -42
- data/scripts/fix-multiple-same-refs.rb +101 -101
- data/scripts/fix-some-entities.rb +36 -43
- data/scripts/fix-unicode-control-chars.rb +30 -51
- data/scripts/insight.rb +132 -132
- data/scripts/lekkoatl-portal.rb +50 -50
- data/scripts/make-id2team-list.rb +31 -31
- data/scripts/recat.rb +27 -32
- data/scripts/wanted.rb +72 -72
- metadata +40 -62
@@ -1,339 +1,339 @@
|
|
1
|
-
require 'orderedhash'
|
2
|
-
require 'hpricot'
|
3
|
-
require 'net/http'
|
4
|
-
require 'sunflower-core.rb'
|
5
|
-
require 'sunflower-listmaker.rb'
|
6
|
-
include Net
|
7
|
-
|
8
|
-
$datafile=File.open('aktual.txt','w')
|
9
|
-
$datafile.sync=true
|
10
|
-
|
11
|
-
id2team={}
|
12
|
-
begin
|
13
|
-
File.open('id2team.txt') do |f|
|
14
|
-
id2team.replace Hash[*f.read.strip.split(/\r?\n|\t/)]
|
15
|
-
end
|
16
|
-
rescue
|
17
|
-
end
|
18
|
-
|
19
|
-
# comes from http://rubyforge.org/frs/?group_id=6257&release_id=36721
|
20
|
-
module Levenshtein
|
21
|
-
VERSION = "0.2.0"
|
22
|
-
|
23
|
-
# Returns the Levenshtein distance as a number between 0.0 and
|
24
|
-
# 1.0. It's basically the Levenshtein distance divided by the
|
25
|
-
# length of the longest sequence.
|
26
|
-
|
27
|
-
def self.normalized_distance(s1, s2, threshold=nil)
|
28
|
-
s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
|
29
|
-
|
30
|
-
if s2.length == 0
|
31
|
-
0.0 # Since s1.length < s2.length, s1 must be empty as well.
|
32
|
-
else
|
33
|
-
if threshold
|
34
|
-
if d = self.distance(s1, s2, (threshold*s2.length+1).to_i)
|
35
|
-
d.to_f/s2.length
|
36
|
-
else
|
37
|
-
nil
|
38
|
-
end
|
39
|
-
else
|
40
|
-
self.distance(s1, s2).to_f/s2.length
|
41
|
-
end
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
# Returns the Levenshtein distance between two sequences.
|
46
|
-
#
|
47
|
-
# The two sequences can be two strings, two arrays, or two other
|
48
|
-
# objects. Strings, arrays and arrays of strings are handled with
|
49
|
-
# optimized (very fast) C code. All other sequences are handled
|
50
|
-
# with generic (fast) C code.
|
51
|
-
#
|
52
|
-
# The sequences should respond to :length and :[] and all objects
|
53
|
-
# in the sequences (as returned by []) should response to :==.
|
54
|
-
|
55
|
-
def self.distance(s1, s2, threshold=nil)
|
56
|
-
s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
|
57
|
-
|
58
|
-
# Handle some basic circumstances.
|
59
|
-
|
60
|
-
return 0 if s1 == s2
|
61
|
-
return s2.length if s1.length == 0
|
62
|
-
|
63
|
-
if threshold
|
64
|
-
return nil if (s2.length-s1.length) >= threshold
|
65
|
-
|
66
|
-
a1, a2 = nil, nil
|
67
|
-
a1, a2 = s1, s2 if s1.respond_to?(:-) and s2.respond_to?(:-)
|
68
|
-
a1, a2 = s1.scan(/./), s2.scan(/./) if s1.respond_to?(:scan) and s2.respond_to?(:scan)
|
69
|
-
|
70
|
-
if a1 and a2
|
71
|
-
return nil if (a1-a2).length >= threshold
|
72
|
-
return nil if (a2-a1).length >= threshold
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
distance_fast_or_slow(s1, s2, threshold)
|
77
|
-
end
|
78
|
-
|
79
|
-
def self.distance_fast_or_slow(s1, s2, threshold) # :nodoc:
|
80
|
-
if respond_to?(:levenshtein_distance_fast)
|
81
|
-
levenshtein_distance_fast(s1, s2, threshold) # Implemented in C.
|
82
|
-
else
|
83
|
-
levenshtein_distance_slow(s1, s2, threshold) # Implemented in Ruby.
|
84
|
-
end
|
85
|
-
end
|
86
|
-
|
87
|
-
def self.levenshtein_distance_slow(s1, s2, threshold) # :nodoc:
|
88
|
-
row = (0..s1.length).to_a
|
89
|
-
|
90
|
-
1.upto(s2.length) do |y|
|
91
|
-
prow = row
|
92
|
-
row = [y]
|
93
|
-
|
94
|
-
1.upto(s1.length) do |x|
|
95
|
-
row[x] = [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min
|
96
|
-
end
|
97
|
-
|
98
|
-
# Stop analysing this sequence as soon as the best possible
|
99
|
-
# result for this sequence is bigger than the best result so far.
|
100
|
-
# (The minimum value in the next row will be equal to or greater
|
101
|
-
# than the minimum value in this row.)
|
102
|
-
|
103
|
-
return nil if threshold and row.min >= threshold
|
104
|
-
end
|
105
|
-
|
106
|
-
row[-1]
|
107
|
-
end
|
108
|
-
end
|
109
|
-
|
110
|
-
|
111
|
-
def puts *arg
|
112
|
-
arg.each{|str| $stdout.puts str; $datafile.puts str}
|
113
|
-
end
|
114
|
-
|
115
|
-
def saveData
|
116
|
-
=begin
|
117
|
-
File.open('aktualdata.txt','w'){|f|
|
118
|
-
f.write "
|
119
|
-
$notfound=#{$notfound.length}
|
120
|
-
$same=#{$same.length}
|
121
|
-
$diff=#{$diff.length}
|
122
|
-
----
|
123
|
-
$notfound:
|
124
|
-
# {$notfound.join "\n"}
|
125
|
-
----
|
126
|
-
$same:
|
127
|
-
# {$same.join "\n"}
|
128
|
-
----
|
129
|
-
$diff:
|
130
|
-
# {$diff.join "\n"}
|
131
|
-
"
|
132
|
-
}
|
133
|
-
=end
|
134
|
-
end
|
135
|
-
|
136
|
-
def get(url)
|
137
|
-
return HTTP.get(URI.parse(url))
|
138
|
-
end
|
139
|
-
|
140
|
-
def getPlayerData url
|
141
|
-
r=get url
|
142
|
-
r=~/<b>All time playing career<\/b>/
|
143
|
-
r=$'
|
144
|
-
r=~/<a name=games><\/a>/
|
145
|
-
table=$`.strip
|
146
|
-
|
147
|
-
h=Hpricot.parse table
|
148
|
-
rows=h.search 'tr+tr'
|
149
|
-
|
150
|
-
data={}
|
151
|
-
rows.each do |r|
|
152
|
-
if r.at('td')['colspan']==nil && (r.inner_html=~/No appearance data available/)==nil
|
153
|
-
cells=r.search 'td'
|
154
|
-
team=cells[0].search('font a')[0].inner_html.strip
|
155
|
-
teamid=cells[0].search('font a')[0]['href'].sub(/\A.+?(\d+)\Z/, '\1')
|
156
|
-
matches=cells[4].at('font').inner_html.split('(').map{|m| m.gsub(/[^0-9]/,'').to_i}
|
157
|
-
matches=matches[0]+matches[1]
|
158
|
-
goals=cells[5].at('font').inner_html.gsub(/[^0-9]/,'').to_i
|
159
|
-
|
160
|
-
data[team]=[matches,goals,teamid]
|
161
|
-
end
|
162
|
-
end
|
163
|
-
return data
|
164
|
-
end
|
165
|
-
|
166
|
-
def searchForPlayer text
|
167
|
-
d=get "http://www.soccerbase.com/search.sd?search_string=#{CGI.escape text}&search_cat=players"
|
168
|
-
d=~/window.location = "(http:[^"]+)"/
|
169
|
-
|
170
|
-
return $1
|
171
|
-
end
|
172
|
-
|
173
|
-
$edits=0
|
174
|
-
$summary='aktualizacja danych o meczach piłkarza'
|
175
|
-
|
176
|
-
puts 'Making list...'
|
177
|
-
s=Sunflower.new('pl.wikipedia.org')
|
178
|
-
s.login
|
179
|
-
enw=Sunflower.new('en.wikipedia.org')
|
180
|
-
enw.login
|
181
|
-
|
182
|
-
# list=(
|
183
|
-
# s.makeList('category-r', 'Kategoria:Piłkarze Aston Villa F.C.')+
|
184
|
-
# s.makeList('category-r', 'Kategoria:Piłkarze Chelsea F.C.')+
|
185
|
-
# s.makeList('category-r', 'Kategoria:Piłkarze Liverpool F.C.')
|
186
|
-
# ).uniq
|
187
|
-
# list=(
|
188
|
-
# s.makeList('category-r', 'Kategoria:Piłkarze angielskich klubów')+
|
189
|
-
# s.makeList('category-r', 'Kategoria:Piłkarze walijskich klubów')
|
190
|
-
# ).uniq
|
191
|
-
|
192
|
-
# list.delete_if{|i| i=~/^Kategoria:/}
|
193
|
-
|
194
|
-
# File.open('lista-pilkarze.txt','w').write list.join("\n")
|
195
|
-
# list=File.open('lista-pilkarze.txt').read.split(/\r?\n/)
|
196
|
-
list=['Wikipedysta:Matma Rex/brudnopis']
|
197
|
-
|
198
|
-
puts 'Done!'
|
199
|
-
puts ''
|
200
|
-
|
201
|
-
$notfound=[]
|
202
|
-
$same=[]
|
203
|
-
$diff=[]
|
204
|
-
|
205
|
-
list.each_with_index do |art, i|
|
206
|
-
exit if $edits>4
|
207
|
-
|
208
|
-
# finding data
|
209
|
-
puts "* [[#{art}]]"
|
210
|
-
pPl=Page.new(art, 'pl')
|
211
|
-
pPl.read=~/\[\[en:([^\]]+)\]\]/
|
212
|
-
if $1
|
213
|
-
artEn=$1
|
214
|
-
puts "** Interwiki-en: [[:en:#{artEn}]]"
|
215
|
-
else
|
216
|
-
artEn=art
|
217
|
-
puts "** No interwiki; guessing [[:en:#{art}]]"
|
218
|
-
end
|
219
|
-
|
220
|
-
pPl.read=~/\{\{soccerbase.*?(\d+).*?\}\}|soccerbase\.com\/players_details\.sd\?playerid=(\d+)/i
|
221
|
-
if $1||$2
|
222
|
-
soccid=$1||$2
|
223
|
-
url="http://www.soccerbase.com/players_details.sd?playerid=#{soccid}"
|
224
|
-
puts '** Found id on plwiki'
|
225
|
-
else
|
226
|
-
pEn=Page.new(art, 'en')
|
227
|
-
pEn.read=~/\{\{soccerbase.*?(\d+).*?\}\}|soccerbase\.com\/players_details\.sd\?playerid=(\d+)/i
|
228
|
-
if $1||$2
|
229
|
-
soccid=$1||$2
|
230
|
-
url="http://www.soccerbase.com/players_details.sd?playerid=#{soccid}"
|
231
|
-
puts '** Found id on enwiki'
|
232
|
-
else
|
233
|
-
url=searchForPlayer(art)||searchForPlayer(artEn)
|
234
|
-
end
|
235
|
-
end
|
236
|
-
|
237
|
-
if url==nil
|
238
|
-
puts '** Not found.'
|
239
|
-
$notfound<<art
|
240
|
-
else
|
241
|
-
data=getPlayerData url
|
242
|
-
puts "** URL: #{url}"
|
243
|
-
unless data.empty?
|
244
|
-
puts "** Found info on soccerbase."
|
245
|
-
else
|
246
|
-
puts '** Found, but no data.'
|
247
|
-
$notfound<<art
|
248
|
-
end
|
249
|
-
end
|
250
|
-
|
251
|
-
pPl.read =~ /występy\(gole\)\s*=(.+)/
|
252
|
-
if $1==nil
|
253
|
-
puts '** Wiki: error. No infobox?'
|
254
|
-
else
|
255
|
-
a=$1.split(/\s*<br.*?>\s*/)[-1].strip
|
256
|
-
a=~/(\d+)\s*\((\d+)\)/
|
257
|
-
matchesW, goalsW = $1.to_i, $2.to_i
|
258
|
-
puts "** Wiki info: #{matchesW} matches, #{goalsW} goals."
|
259
|
-
end
|
260
|
-
|
261
|
-
saveData if i%30==0 && i!=0
|
262
|
-
|
263
|
-
# $change=File.open('changelist.txt','w')
|
264
|
-
# $change.sync=true
|
265
|
-
|
266
|
-
# editing
|
267
|
-
if data
|
268
|
-
#$change.puts "* [[#{art}]] - #{matchesW}/#{goalsW} -> #{matches}/#{goals}"
|
269
|
-
|
270
|
-
pPl.text=~/(kluby\s*=\s*)([^\|]+)(\s*\|)/
|
271
|
-
kluby=$2
|
272
|
-
pPl.text=~/(występy\(gole\)\s*=\s*)([^\|]+)(\s*\|)/
|
273
|
-
wystepygole=$2
|
274
|
-
|
275
|
-
resolve={}
|
276
|
-
kluby=kluby.split(/<\/?br[^>]*>/).map do |i|
|
277
|
-
short=i.strip.gsub(/\[\[(?:[^\]\|]+\||)([^\]\|]+)\]\]/,'\1').gsub(/→|\(wyp\.\)/,'').strip
|
278
|
-
resolve[short]=i.strip
|
279
|
-
short
|
280
|
-
end
|
281
|
-
wystepygole=wystepygole.split(/<\/?br[^>]*?>/).map{|i| i.strip}
|
282
|
-
wystepygole.delete_if{|i| i==''}
|
283
|
-
kluby.delete_if{|i| i==''}
|
284
|
-
|
285
|
-
wystepygole.pop while wystepygole.length>kluby.length
|
286
|
-
wystepygole.push [0,0] while wystepygole.length<kluby.length
|
287
|
-
|
288
|
-
wikidata=OrderedHash.new
|
289
|
-
kluby.each_index do |i|
|
290
|
-
wystepygole[i]=~/(\d+)\s*\((\d+)\)/
|
291
|
-
wikidata[kluby[i]]=[$1.to_i, $2.to_i]
|
292
|
-
end
|
293
|
-
|
294
|
-
# puts data.inspect
|
295
|
-
# puts wikidata.inspect
|
296
|
-
|
297
|
-
data.each_pair do |scbclub, scb, teamid|
|
298
|
-
min=[999, 'null']
|
299
|
-
wikidata.each_pair do |wikiclub, wiki|
|
300
|
-
if wikiclub.index scbclub || scbclub.index wikiclub
|
301
|
-
min=[0, wikiclub]
|
302
|
-
break
|
303
|
-
end
|
304
|
-
if wikiclub.index id2team[teamid] || id2team[teamid].index wikiclub
|
305
|
-
min=[0, wikiclub]
|
306
|
-
break
|
307
|
-
end
|
308
|
-
|
309
|
-
d=Levenshtein.distance(scbclub, wikiclub)
|
310
|
-
min=[d, wikiclub] if d<min[0]
|
311
|
-
|
312
|
-
d=Levenshtein.distance(id2team[teamid], wikiclub)
|
313
|
-
min=[d, wikiclub] if d<min[0]
|
314
|
-
end
|
315
|
-
club=min[1]
|
316
|
-
|
317
|
-
wikidata[club]=data[scbclub]
|
318
|
-
end
|
319
|
-
|
320
|
-
infoboxwystepygole=[]
|
321
|
-
infoboxkluby=[]
|
322
|
-
|
323
|
-
wikidata.each do |club, info|
|
324
|
-
infoboxkluby<<resolve[club]
|
325
|
-
infoboxwystepygole<<"#{info[0]} (#{info[1]})"
|
326
|
-
end
|
327
|
-
|
328
|
-
infoboxkluby=infoboxkluby.join('<br />')
|
329
|
-
infoboxwystepygole=infoboxwystepygole.join('<br />')
|
330
|
-
|
331
|
-
pPl.text=pPl.text.sub(/(występy\(gole\)\s*=\s*)([^\|]+?)(\s*\|)/){$1+infoboxwystepygole+$3}
|
332
|
-
pPl.text=pPl.text.sub(/(kluby\s*=\s*)([^\|]+?)(\s*\|)/){$1+infoboxkluby+$3}
|
333
|
-
pPl.text=pPl.text.sub(/(data1\s*=\s*)([^\|]+?)(\s*\|)/, '\1{{subst:CURRENTDAY}} {{subst:CURRENTMONTHNAMEGEN}} {{subst:CURRENTYEAR}}\3')
|
334
|
-
|
335
|
-
$edits+=1
|
336
|
-
pPl.save
|
337
|
-
end
|
338
|
-
end
|
339
|
-
|
1
|
+
require 'orderedhash'
|
2
|
+
require 'hpricot'
|
3
|
+
require 'net/http'
|
4
|
+
require 'sunflower-core.rb'
|
5
|
+
require 'sunflower-listmaker.rb'
|
6
|
+
include Net
|
7
|
+
|
8
|
+
# Log file that receives a copy of everything printed through the overridden
# top-level `puts`; sync mode writes each line immediately, without buffering.
$datafile = File.open('aktual.txt', 'w')
$datafile.sync = true

# Lookup table loaded from id2team.txt. The file content is split on tabs and
# newlines and the resulting tokens are taken pairwise as key, value.
id2team = {}
begin
  tokens = File.read('id2team.txt').strip.split(/\r?\n|\t/)
  id2team.replace(Hash[*tokens])
rescue StandardError
  # Best effort: a missing or malformed file simply leaves the table empty.
end
|
18
|
+
|
19
|
+
# comes from http://rubyforge.org/frs/?group_id=6257&release_id=36721
|
20
|
+
module Levenshtein
  VERSION = "0.2.0"

  # Returns the Levenshtein distance scaled to the range 0.0..1.0, i.e. the
  # raw distance divided by the length of the longer sequence.
  #
  # When +threshold+ (a fraction) is given, it is converted to an absolute
  # limit and nil is returned if the raw distance reaches that limit.
  def self.normalized_distance(s1, s2, threshold=nil)
    shorter, longer = s1.length > s2.length ? [s2, s1] : [s1, s2]
    longest = longer.length

    # Both sequences are empty when the longer one is.
    return 0.0 if longest == 0
    return distance(shorter, longer).to_f / longest unless threshold

    raw = distance(shorter, longer, (threshold*longest+1).to_i)
    raw ? raw.to_f / longest : nil
  end

  # Returns the Levenshtein distance between two sequences.
  #
  # The sequences may be two strings, two arrays, or any two objects that
  # respond to :length and :[] and whose elements respond to :==.
  #
  # When +threshold+ is given, nil is returned as soon as the distance is
  # known to be greater than or equal to it.
  def self.distance(s1, s2, threshold=nil)
    shorter, longer = s1.length > s2.length ? [s2, s1] : [s1, s2]

    # Trivial cases.
    return 0 if shorter == longer
    return longer.length if shorter.length == 0

    if threshold
      # The length difference alone is a lower bound on the distance.
      return nil if (longer.length - shorter.length) >= threshold

      # Elements present in one sequence but absent from the other each need
      # at least one edit, which gives another cheap lower bound. Strings are
      # compared per character; anything supporting `-` is used as is.
      left, right =
        if shorter.respond_to?(:scan) && longer.respond_to?(:scan)
          [shorter.scan(/./), longer.scan(/./)]
        elsif shorter.respond_to?(:-) && longer.respond_to?(:-)
          [shorter, longer]
        end

      if left && right
        return nil if (left - right).length >= threshold
        return nil if (right - left).length >= threshold
      end
    end

    distance_fast_or_slow(shorter, longer, threshold)
  end

  # Uses the C implementation when it has been loaded, otherwise the pure
  # Ruby one.
  def self.distance_fast_or_slow(s1, s2, threshold) # :nodoc:
    return levenshtein_distance_fast(s1, s2, threshold) if respond_to?(:levenshtein_distance_fast)

    levenshtein_distance_slow(s1, s2, threshold)
  end

  # Pure Ruby dynamic-programming implementation keeping only two rows.
  def self.levenshtein_distance_slow(s1, s2, threshold) # :nodoc:
    previous = (0..s1.length).to_a

    (1..s2.length).each do |y|
      current = [y]

      (1..s1.length).each do |x|
        removal      = previous[x] + 1
        insertion    = current[x-1] + 1
        substitution = previous[x-1] + (s1[x-1] == s2[y-1] ? 0 : 1)
        current << [removal, insertion, substitution].min
      end

      # Row minima never decrease, so once a whole row has reached the
      # threshold the final result cannot fall below it.
      return nil if threshold && current.min >= threshold

      previous = current
    end

    previous.last
  end
end
|
109
|
+
|
110
|
+
|
111
|
+
# Replacement for Kernel#puts that writes every argument both to standard
# output and to the log file held in $datafile.
#
# Called without arguments it emits a single empty line to both targets,
# matching what the built-in puts does (the previous version printed nothing
# in that case).
def puts *arg
  arg = [''] if arg.empty?
  arg.each do |str|
    $stdout.puts str
    $datafile.puts str
  end
end
|
114
|
+
|
115
|
+
# Checkpoint hook called periodically from the main loop. The code that used
# to write a report to aktualdata.txt is disabled, so the method currently
# does nothing and returns nil.
def saveData
  # Disabled implementation, kept for reference:
  #
  #   File.open('aktualdata.txt','w'){|f|
  #   f.write "
  #   $notfound=#{$notfound.length}
  #   $same=#{$same.length}
  #   $diff=#{$diff.length}
  #   ----
  #   $notfound:
  #   # {$notfound.join "\n"}
  #   ----
  #   $same:
  #   # {$same.join "\n"}
  #   ----
  #   $diff:
  #   # {$diff.join "\n"}
  #   "
  #   }
end
|
135
|
+
|
136
|
+
# Performs an HTTP GET request for +url+ (a String) and returns whatever
# Net::HTTP.get yields for the parsed URI.
def get(url)
  target = URI.parse(url)
  Net::HTTP.get(target)
end
|
139
|
+
|
140
|
+
# Downloads a soccerbase player page and extracts the per-club career table.
#
# url - String address of the player page.
#
# Returns a Hash mapping the club name (link text) to
# [appearances, goals, team id], where the team id is the trailing number of
# the club link's href. Returns an empty Hash when the page does not contain
# the expected "All time playing career" section, so callers can treat
# "layout not recognised" the same as "no data" instead of crashing.
def getPlayerData url
  page = get url

  # The table sits between the career heading and the "games" anchor.
  return {} unless page =~ /<b>All time playing career<\/b>/
  after_heading = Regexp.last_match.post_match
  return {} unless after_heading =~ /<a name=games><\/a>/
  table = Regexp.last_match.pre_match.strip

  data = {}
  Hpricot.parse(table).search('tr+tr').each do |row|
    first_cell = row.at('td')
    # Skip rows without cells and rows whose first cell spans columns.
    next if first_cell.nil? || first_cell['colspan']
    next if row.inner_html =~ /No appearance data available/

    cells = row.search 'td'
    link = cells[0].search('font a')[0]
    team = link.inner_html.strip
    teamid = link['href'].sub(/\A.+?(\d+)\Z/, '\1')

    # The appearances cell looks like "12 (3)"; both numbers are added up.
    # A missing part counts as zero.
    counts = cells[4].at('font').inner_html.split('(').map { |m| m.gsub(/[^0-9]/, '').to_i }
    matches = counts[0].to_i + counts[1].to_i
    goals = cells[5].at('font').inner_html.gsub(/[^0-9]/, '').to_i

    data[team] = [matches, goals, teamid]
  end
  data
end
|
165
|
+
|
166
|
+
# Looks a player up through the soccerbase search form.
#
# text - String to search for (it is URL-escaped here).
#
# Returns the URL found in a `window.location = "http:..."` assignment on the
# result page, or nil when the page contains no such assignment.
def searchForPlayer text
  # CGI is used below, but this script's own require list does not load it.
  require 'cgi'

  page = get "http://www.soccerbase.com/search.sd?search_string=#{CGI.escape text}&search_cat=players"
  page.to_s[/window\.location = "(http:[^"]+)"/, 1]
end
|
172
|
+
|
173
|
+
# Number of pages saved so far; the main loop exits once it exceeds 4.
$edits = 0
# Edit summary text ("updating the footballer's match data").
# NOTE(review): presumably read by the Sunflower library when saving - confirm.
$summary = 'aktualizacja danych o meczach piłkarza'

puts 'Making list...'

# Log in to both wikis.
s = Sunflower.new('pl.wikipedia.org')
s.login
enw = Sunflower.new('en.wikipedia.org')
enw.login

# Earlier ways of building the work list, kept for reference:
#
# list=(
# s.makeList('category-r', 'Kategoria:Piłkarze Aston Villa F.C.')+
# s.makeList('category-r', 'Kategoria:Piłkarze Chelsea F.C.')+
# s.makeList('category-r', 'Kategoria:Piłkarze Liverpool F.C.')
# ).uniq
# list=(
# s.makeList('category-r', 'Kategoria:Piłkarze angielskich klubów')+
# s.makeList('category-r', 'Kategoria:Piłkarze walijskich klubów')
# ).uniq
#
# list.delete_if{|i| i=~/^Kategoria:/}
#
# File.open('lista-pilkarze.txt','w').write list.join("\n")
# list=File.open('lista-pilkarze.txt').read.split(/\r?\n/)

# Currently only a single page is processed.
list = ['Wikipedysta:Matma Rex/brudnopis']

puts 'Done!'
puts ''

# Result buckets; only $notfound is filled by the main loop in this script.
$notfound = []
$same = []
$diff = []
|
204
|
+
|
205
|
+
# Main loop. For every listed article it
#   1. finds the player's soccerbase page (id on plwiki, id on enwiki, or the
#      soccerbase search),
#   2. reads per-club appearances/goals from soccerbase,
#   3. matches the soccerbase clubs to the clubs in the plwiki infobox, and
#   4. rewrites the infobox fields `kluby`, `występy(gole)` and `data1`.
#
# Fixes compared with the previous version:
#   * the `index` calls are parenthesised (`a.index b || c.index d` does not
#     parse as two separate lookups),
#   * the team id is taken from the soccerbase record; the old third block
#     parameter of each_pair was always nil,
#   * clubs without an id2team entry no longer raise,
#   * missing appearance entries are padded with the string '0 (0)' instead
#     of an Array, which made `=~` leave stale $1/$2 values,
#   * the enwiki lookup uses the interwiki title (artEn),
#   * pages without soccerbase data or without the infobox fields are skipped
#     instead of being edited or crashing on nil.
list.each_with_index do |art, i|
  exit if $edits>4 # hard cap on the number of saved edits per run

  # finding data
  puts "* [[#{art}]]"
  pPl=Page.new(art, 'pl')
  pPl.read=~/\[\[en:([^\]]+)\]\]/
  if $1
    artEn=$1
    puts "** Interwiki-en: [[:en:#{artEn}]]"
  else
    artEn=art
    puts "** No interwiki; guessing [[:en:#{art}]]"
  end

  # Matches either a {{soccerbase ...}} template or a direct player link.
  soccerbase_ref=/\{\{soccerbase.*?(\d+).*?\}\}|soccerbase\.com\/players_details\.sd\?playerid=(\d+)/i

  pPl.read=~soccerbase_ref
  if $1||$2
    soccid=$1||$2
    url="http://www.soccerbase.com/players_details.sd?playerid=#{soccid}"
    puts '** Found id on plwiki'
  else
    pEn=Page.new(artEn, 'en')
    pEn.read=~soccerbase_ref
    if $1||$2
      soccid=$1||$2
      url="http://www.soccerbase.com/players_details.sd?playerid=#{soccid}"
      puts '** Found id on enwiki'
    else
      url=searchForPlayer(art)||searchForPlayer(artEn)
    end
  end

  data=nil
  if url.nil?
    puts '** Not found.'
    $notfound<<art
  else
    data=getPlayerData url
    puts "** URL: #{url}"
    if data.empty?
      puts '** Found, but no data.'
      $notfound<<art
    else
      puts "** Found info on soccerbase."
    end
  end

  # Report what the infobox currently says (last entry of the field).
  pPl.read =~ /występy\(gole\)\s*=(.+)/
  if $1.nil?
    puts '** Wiki: error. No infobox?'
  else
    a=$1.split(/\s*<br.*?>\s*/)[-1].to_s.strip
    a=~/(\d+)\s*\((\d+)\)/
    matchesW, goalsW = $1.to_i, $2.to_i
    puts "** Wiki info: #{matchesW} matches, #{goalsW} goals."
  end

  saveData if i%30==0 && i!=0

  # $change=File.open('changelist.txt','w')
  # $change.sync=true

  # editing
  # Nothing to write when soccerbase gave no usable data.
  next if data.nil? || data.empty?

  #$change.puts "* [[#{art}]] - #{matchesW}/#{goalsW} -> #{matches}/#{goals}"

  # NOTE(review): `[^\|]+` stops at the first pipe, so piped links inside
  # these fields would truncate the capture - confirm against real pages.
  pPl.text=~/(kluby\s*=\s*)([^\|]+)(\s*\|)/
  kluby=$2
  pPl.text=~/(występy\(gole\)\s*=\s*)([^\|]+)(\s*\|)/
  wystepygole=$2

  unless kluby && wystepygole
    puts '** Infobox fields not found; skipping.'
    next
  end

  # resolve maps the plain club name back to the original wikitext entry.
  resolve={}
  kluby=kluby.split(/<\/?br[^>]*>/).map do |entry|
    short=entry.strip.gsub(/\[\[(?:[^\]\|]+\||)([^\]\|]+)\]\]/,'\1').gsub(/→|\(wyp\.\)/,'').strip
    resolve[short]=entry.strip
    short
  end
  wystepygole=wystepygole.split(/<\/?br[^>]*?>/).map{|entry| entry.strip}
  wystepygole.delete_if{|entry| entry==''}
  kluby.delete_if{|entry| entry==''}

  # Make both lists the same length: drop surplus stats, pad missing ones.
  wystepygole.pop while wystepygole.length>kluby.length
  wystepygole.push '0 (0)' while wystepygole.length<kluby.length

  wikidata=OrderedHash.new
  kluby.each_index do |idx|
    wystepygole[idx]=~/(\d+)\s*\((\d+)\)/
    wikidata[kluby[idx]]=[$1.to_i, $2.to_i]
  end

  # puts data.inspect
  # puts wikidata.inspect

  # For each soccerbase club pick the infobox club that contains / is
  # contained in its name, or otherwise the one with the smallest edit
  # distance, and overwrite that club's numbers.
  data.each_pair do |scbclub, scb|
    teamname=id2team[scb[2]] # alternative name from id2team.txt; may be nil
    best=[999, nil]
    wikidata.each_pair do |wikiclub, _wiki|
      if wikiclub.index(scbclub) || scbclub.index(wikiclub)
        best=[0, wikiclub]
        break
      end
      if teamname && (wikiclub.index(teamname) || teamname.index(wikiclub))
        best=[0, wikiclub]
        break
      end

      d=Levenshtein.distance(scbclub, wikiclub)
      best=[d, wikiclub] if d<best[0]

      if teamname
        d=Levenshtein.distance(teamname, wikiclub)
        best=[d, wikiclub] if d<best[0]
      end
    end
    club=best[1]
    next unless club # the infobox lists no clubs at all

    wikidata[club]=scb
  end

  infoboxwystepygole=[]
  infoboxkluby=[]

  wikidata.each do |club, info|
    infoboxkluby<<resolve[club]
    infoboxwystepygole<<"#{info[0]} (#{info[1]})"
  end

  infoboxkluby=infoboxkluby.join('<br />')
  infoboxwystepygole=infoboxwystepygole.join('<br />')

  pPl.text=pPl.text.sub(/(występy\(gole\)\s*=\s*)([^\|]+?)(\s*\|)/){$1+infoboxwystepygole+$3}
  pPl.text=pPl.text.sub(/(kluby\s*=\s*)([^\|]+?)(\s*\|)/){$1+infoboxkluby+$3}
  pPl.text=pPl.text.sub(/(data1\s*=\s*)([^\|]+?)(\s*\|)/, '\1{{subst:CURRENTDAY}} {{subst:CURRENTMONTHNAMEGEN}} {{subst:CURRENTYEAR}}\3')

  $edits+=1
  pPl.save
end
|
339
|
+
|