wikipedia 0.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/wikipedia.rb +36 -4
- metadata +2 -2
data/lib/wikipedia.rb
CHANGED
@@ -4,19 +4,33 @@ require 'rubygems'
|
|
4
4
|
require 'bundler/setup'
|
5
5
|
|
6
6
|
require 'open-uri'
|
7
|
+
require 'cgi'
|
8
|
+
|
7
9
|
require 'htmlentities'
|
8
10
|
require 'hpricot'
|
9
11
|
|
12
|
+
# Core extension used to normalise article titles before they are sent to the API.
class String
  # Returns a copy of the string in which every whitespace-separated word is
  # capitalized: first character upper-cased, remaining characters lower-cased
  # (the semantics of String#capitalize).
  #
  # Splitting on ' ' drops leading/trailing whitespace and collapses runs of
  # whitespace, so the words of the result are joined by single spaces.
  #
  # @return [String] a new string; the receiver is not modified
  def capitalize_every_word
    split(' ').map(&:capitalize).join(' ')
  end

  # In-place variant of #capitalize_every_word.
  #
  # @return [String] the receiver itself, after its content has been replaced
  # @raise [FrozenError] if the receiver is frozen (raised by String#replace)
  def capitalize_every_word!
    replace(capitalize_every_word)
  end
end
|
24
|
+
|
10
25
|
module Wikipedia
|
11
26
|
|
12
27
|
URL = "http://%LANG%.wikipedia.org/w/api.php?action=parse&page="
|
13
28
|
|
14
|
-
def self.article( n, lang = :
|
29
|
+
def self.article( n, lang = :en )
|
15
30
|
|
16
31
|
texts = []
|
17
32
|
|
18
|
-
raw_data = open( URL.gsub("%LANG%", lang.to_s)+n ).read()
|
19
|
-
#raw_data = File.read('pareidolia').gsub("\n", "")
|
33
|
+
raw_data = open( URL.gsub("%LANG%", lang.to_s)+escape(n) ).read()
|
20
34
|
|
21
35
|
he = HTMLEntities.new()
|
22
36
|
|
@@ -25,10 +39,28 @@ module Wikipedia
|
|
25
39
|
raw_data = he.decode( he.decode( raw_data ) ).gsub("\n", "") # >:D
|
26
40
|
|
27
41
|
Hpricot(raw_data).search('p').each do |ph|
|
28
|
-
texts << ph.inner_text
|
42
|
+
texts << escape_text( ph.inner_text )
|
29
43
|
end
|
30
44
|
|
31
45
|
return texts
|
32
46
|
|
33
47
|
end
|
48
|
+
|
49
|
+
# Turns an article title into a value that can be appended to the API URL.
#
# Every whitespace-separated word of the title is capitalized via
# String#capitalize_every_word and the result is CGI-escaped.
# The non-mutating variant is used on purpose: the caller's string is left
# untouched, and frozen strings are accepted as well.
#
# @param s [String] article title, e.g. "new york city"
# @return [String] the escaped title, e.g. "New+York+City"
def self.escape(s)
  CGI.escape(s.capitalize_every_word)
end
|
56
|
+
|
57
|
+
# Removes bracketed markers (the cite-notes "[0]", "[1]", etc.) from a paragraph.
#
# Hpricot's inner_text() already strips the markup, but the cite-note markers
# stay behind as plain text, so they are removed here.
#
# Each bracket pair is matched on its own ("[" followed by anything that is
# not a bracket, then "]"). A greedy "\[(.*)\]" would instead swallow all the
# text between the first "[" and the last "]" of the paragraph.
#
# @param s [String] paragraph text
# @return [String] a new string without the bracketed markers; the argument
#   itself is not modified, so frozen strings are accepted
def self.escape_text(s)
  s.gsub(/\[[^\[\]]*\]/, '')
end
|
34
66
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wikipedia
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.1'
|
4
|
+
version: '0.2'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -18,7 +18,7 @@ extensions: []
|
|
18
18
|
extra_rdoc_files: []
|
19
19
|
files:
|
20
20
|
- lib/wikipedia.rb
|
21
|
-
homepage: http://
|
21
|
+
homepage: http://github.com/matiasinsaurralde/wikipedia
|
22
22
|
licenses: []
|
23
23
|
post_install_message:
|
24
24
|
rdoc_options: []
|