wikipedia 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/wikipedia.rb +34 -0
  2. metadata +45 -0
@@ -0,0 +1,34 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rubygems'
4
+ require 'bundler/setup'
5
+
6
+ require 'open-uri'
7
+ require 'htmlentities'
8
+ require 'hpricot'
9
+
10
# Fetches Wikipedia articles through the MediaWiki web API and extracts the
# text of their paragraphs.
module Wikipedia

  # Endpoint template. "%LANG%" is replaced by the language subdomain and the
  # (escaped) page title is appended to the end of the query string.
  URL = "http://%LANG%.wikipedia.org/w/api.php?action=parse&page="

  # Downloads an article and returns the text of each of its paragraphs.
  #
  # n    - article title. It is converted with #to_s and percent-escaped
  #        here, so pass the plain title (e.g. "Río Paraguay"), not a
  #        pre-escaped one.
  # lang - language subdomain of wikipedia.org, String or Symbol
  #        (default: :es).
  #
  # Returns an Array of Strings, one per <p> element found in the response.
  # Errors raised by open-uri while fetching (e.g. OpenURI::HTTPError) are
  # not rescued and propagate to the caller.
  def self.article(n, lang = :es)
    # Escape the title: spaces or non-ASCII characters would otherwise make
    # the URI invalid, and "&" / "#" would corrupt the query string.
    endpoint = URL.gsub("%LANG%", lang.to_s) + URI.encode_www_form_component(n.to_s)

    # URI#open (mixed in by open-uri) replaces Kernel#open, which no longer
    # accepts URLs on Ruby 3.0+. The block form closes the handle after reading.
    raw_data = URI.parse(endpoint).open { |io| io.read }

    # Entities are decoded twice, then newlines are stripped, before parsing.
    # NOTE(review): presumably the API's default HTML-formatted output escapes
    # the article markup twice -- confirm against a live response.
    decoder = HTMLEntities.new
    markup = decoder.decode(decoder.decode(raw_data)).gsub("\n", "")

    Hpricot(markup).search('p').map { |paragraph| paragraph.inner_text }
  end
end
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: wikipedia
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Matias Insaurralde
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-05-03 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: tool for extracting plain text from wikipedia articles
15
+ email: matiasbaruchinsaurralde@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/wikipedia.rb
21
+ homepage: http://lomitologia.blogspot.com/2013/05/030513.html
22
+ licenses: []
23
+ post_install_message:
24
+ rdoc_options: []
25
+ require_paths:
26
+ - lib
27
+ required_ruby_version: !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ none: false
35
+ requirements:
36
+ - - ! '>='
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubyforge_project:
41
+ rubygems_version: 1.8.23
42
+ signing_key:
43
+ specification_version: 3
44
+ summary: tool for extracting plain text from wikipedia articles
45
+ test_files: []