wikipedia 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2):
  1. data/lib/wikipedia.rb +36 -4
  2. metadata +2 -2
data/lib/wikipedia.rb CHANGED
@@ -4,19 +4,33 @@ require 'rubygems'
4
4
  require 'bundler/setup'
5
5
 
6
6
  require 'open-uri'
7
+ require 'cgi'
8
+
7
9
  require 'htmlentities'
8
10
  require 'hpricot'
9
11
 
12
+ class String
13
+ def capitalize_every_word()
14
+ new_string = []
15
+ self.split(' ').each do |word|
16
+ new_string << word.capitalize
17
+ end
18
+ return new_string.join(' ')
19
+ end
20
+ def capitalize_every_word!()
21
+ self.replace( self.capitalize_every_word() )
22
+ end
23
+ end
24
+
10
25
  module Wikipedia
11
26
 
12
27
  URL = "http://%LANG%.wikipedia.org/w/api.php?action=parse&page="
13
28
 
14
- def self.article( n, lang = :es )
29
+ def self.article( n, lang = :en )
15
30
 
16
31
  texts = []
17
32
 
18
- raw_data = open( URL.gsub("%LANG%", lang.to_s)+n ).read()
19
- #raw_data = File.read('pareidolia').gsub("\n", "")
33
+ raw_data = open( URL.gsub("%LANG%", lang.to_s)+escape(n) ).read()
20
34
 
21
35
  he = HTMLEntities.new()
22
36
 
@@ -25,10 +39,28 @@ module Wikipedia
25
39
  raw_data = he.decode( he.decode( raw_data ) ).gsub("\n", "") # >:D
26
40
 
27
41
  Hpricot(raw_data).search('p').each do |ph|
28
- texts << ph.inner_text
42
+ texts << escape_text( ph.inner_text )
29
43
  end
30
44
 
31
45
  return texts
32
46
 
33
47
  end
48
+
49
+ def self.escape(s)
50
+
51
+ s.capitalize_every_word!
52
+
53
+ CGI.escape( s )
54
+
55
+ end
56
+
57
+ def self.escape_text(s)
58
+
59
+ # Hpricot's inner_text() does this already but we don't want the cite-notes stuff: [0], [1], etc.
60
+
61
+ { Regexp.new("\\[(.*)\\]") => '' }.each { |str, replace_with| s.gsub!( str, replace_with ) }
62
+
63
+ s
64
+
65
+ end
34
66
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wikipedia
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: '0.2'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -18,7 +18,7 @@ extensions: []
18
18
  extra_rdoc_files: []
19
19
  files:
20
20
  - lib/wikipedia.rb
21
- homepage: http://lomitologia.blogspot.com/2013/05/030513.html
21
+ homepage: http://github.com/matiasinsaurralde/wikipedia
22
22
  licenses: []
23
23
  post_install_message:
24
24
  rdoc_options: []