wikipedia 0.1 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/wikipedia.rb +36 -4
  2. metadata +2 -2
data/lib/wikipedia.rb CHANGED
@@ -4,19 +4,33 @@ require 'rubygems'
4
4
  require 'bundler/setup'
5
5
 
6
6
  require 'open-uri'
7
+ require 'cgi'
8
+
7
9
  require 'htmlentities'
8
10
  require 'hpricot'
9
11
 
12
+ class String
13
+ def capitalize_every_word()
14
+ new_string = []
15
+ self.split(' ').each do |word|
16
+ new_string << word.capitalize
17
+ end
18
+ return new_string.join(' ')
19
+ end
20
+ def capitalize_every_word!()
21
+ self.replace( self.capitalize_every_word() )
22
+ end
23
+ end
24
+
10
25
  module Wikipedia
11
26
 
12
27
  URL = "http://%LANG%.wikipedia.org/w/api.php?action=parse&page="
13
28
 
14
- def self.article( n, lang = :es )
29
+ def self.article( n, lang = :en )
15
30
 
16
31
  texts = []
17
32
 
18
- raw_data = open( URL.gsub("%LANG%", lang.to_s)+n ).read()
19
- #raw_data = File.read('pareidolia').gsub("\n", "")
33
+ raw_data = open( URL.gsub("%LANG%", lang.to_s)+escape(n) ).read()
20
34
 
21
35
  he = HTMLEntities.new()
22
36
 
@@ -25,10 +39,28 @@ module Wikipedia
25
39
  raw_data = he.decode( he.decode( raw_data ) ).gsub("\n", "") # >:D
26
40
 
27
41
  Hpricot(raw_data).search('p').each do |ph|
28
- texts << ph.inner_text
42
+ texts << escape_text( ph.inner_text )
29
43
  end
30
44
 
31
45
  return texts
32
46
 
33
47
  end
48
+
49
+ def self.escape(s)
50
+
51
+ s.capitalize_every_word!
52
+
53
+ CGI.escape( s )
54
+
55
+ end
56
+
57
+ def self.escape_text(s)
58
+
59
+ # Hpricot's inner_text() does this already but we don't want the cite-notes stuff: [0], [1], etc.
60
+
61
+ { Regexp.new("\\[(.*)\\]") => '' }.each { |str, replace_with| s.gsub!( str, replace_with ) }
62
+
63
+ s
64
+
65
+ end
34
66
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wikipedia
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: '0.2'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -18,7 +18,7 @@ extensions: []
18
18
  extra_rdoc_files: []
19
19
  files:
20
20
  - lib/wikipedia.rb
21
- homepage: http://lomitologia.blogspot.com/2013/05/030513.html
21
+ homepage: http://github.com/matiasinsaurralde/wikipedia
22
22
  licenses: []
23
23
  post_install_message:
24
24
  rdoc_options: []