wikipedia_parser 1.0.3 → 1.0.5
- checksums.yaml +8 -8
- data/lib/wikiParser.rb +68 -0
- data/lib/wikiParserPage.rb +98 -0
- data/lib/wikipedia_parser.rb +1 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-
+    YzAxOTllY2I0ZmY0NTI1ZjFiMDUzN2EzNmJiMzVmYzljZDE1MDY3Nw==
   data.tar.gz: !binary |-
-
+    MTFiOTI2NzNiOWI0MDEyOTkwODllZTQwYmNhYjAzN2Q0YWJhZWIxMw==
 SHA512:
   metadata.gz: !binary |-
-
-
-
+    ODI1YjBmMGIwOTYyZDlmNDYwMmNjMTNjOTgyZGMyMTY5ZTdhM2ZmMGEyZTY4
+    NjU1YTVkNDU4OTRkOGMzMzk1ZDA5NmFjNTgwYjIzNmZiMmY0Y2Y1NDk5YWU1
+    YzVkZjllZGU0NDE1MDU0NGI3NGJkZjI5MTU5ZWYyOGNmZWQzZjc=
   data.tar.gz: !binary |-
-
-
-
+    Y2M5YzYwNGNhODA4MWY5YzdjMDc4Y2FjODJiMTgyYjMxZDAwYTE0NGU1NTlm
+    Y2IyMTkyMzQ2NDYyNDllNzY0ZmYxOWE3ODRhMzk3M2M1ZmRmOTMxZGFkY2Zh
+    YjdjYzM3ZmI3NWI2MDU2NDdlZjE4YTVlZGM2ZGEyMTM5ZTFlMjM=
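The new digest values appear to be Base64-encoded hex strings (the SHA-1 entry decodes to 40 hex characters), so an unpacked gem can be checked by hand. A minimal verification sketch using only the Ruby standard library, assuming that encoding convention holds and that metadata.gz from the gem archive sits in the current directory:

    require 'base64'
    require 'digest'

    # checksums.yaml stores each digest as Base64 over the hex string (assumption).
    stored = Base64.decode64('YzAxOTllY2I0ZmY0NTI1ZjFiMDUzN2EzNmJiMzVmYzljZDE1MDY3Nw==')
    actual = Digest::SHA1.hexdigest(File.read('metadata.gz'))
    puts(stored == actual ? 'metadata.gz checksum OK' : 'checksum mismatch')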
data/lib/wikiParser.rb
ADDED
@@ -0,0 +1,68 @@
+# coding: utf-8
+require 'nokogiri'
+require 'fileutils'
+require(File.dirname(__FILE__)+'/wikiParserPage.rb')
+
+# Parses a Wikipedia dump and extracts internal links, content, and page type.
+class WikiParser
+
+  # Path to the Wikipedia dump.
+  attr_reader :path
+
+  # Converts the dump at the opened path into an enumerator of XML nodes.
+  # @return [Enumerator<Nokogiri::XML::Node>] the enumerator.
+  def parse
+    @xml_file = File.open(@path)
+    # .bz2 dumps are wrapped in a decompressing reader; plain .xml is read directly.
+    input = @path.match(/.+\.bz2/) ? (require 'bzip2'; Bzip2::Reader.open(@path)) : @xml_file
+    @file = Nokogiri::XML::Reader(input, nil, 'utf-8', Nokogiri::XML::ParseOptions::NOERROR)
+    @reader = @file.to_enum
+  end
+
+  # Opens a Wikipedia dump and prepares it for parsing.
+  # @param opts [Hash] the parameters to parse a Wikipedia dump.
+  # @option opts [String] :path The path to the Wikipedia dump in .xml or .bz2 format.
+  def initialize(opts = {})
+    @file, new_path = nil, opts[:path]
+    if File.exists? new_path and !File.directory? new_path
+      @path = new_path
+      parse
+    else
+      raise ArgumentError.new "Cannot open file. Check path please."
+    end
+  end
+
+  # Closes the file reader.
+  def close; @xml_file.close if @xml_file; end
+
+  # Skips a {WikiParser::Page} in the enumeration.
+  def skip
+    node = @reader.next
+    # Recurse until an opening <page> element has been consumed.
+    unless node.name == "page" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
+      skip
+    end
+  rescue StopIteration
+    nil
+  end
+
+  # Reads the next node in the XML tree and returns it as a {WikiParser::Page} if it exists.
+  # @return [WikiParser::Page, NilClass] A page if found.
+  # @param opts [Hash] the parameters to instantiate a page.
+  # @option opts [String] :until A node-name stopping point for the parsing. (Useful for not parsing an entire page until some property is checked.)
+  # @see Page#finish_processing
+  def get_next_page(opts = {})
+    node = @reader.next
+    if node.name == "page" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
+      xml = Nokogiri::XML::parse("<page>" + node.inner_xml + "</page>").first_element_child
+      WikiParser::Page.new({:node => xml}.merge(opts))
+    else
+      get_next_page(opts)
+    end
+  rescue StopIteration, NoMethodError
+    nil
+  end
+end
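For orientation, a minimal usage sketch of the class above, loading the gem through its entry point lib/wikipedia_parser.rb (shown further below); the dump filename is a hypothetical stand-in:

    require 'wikipedia_parser'

    # Iterate over every page in the dump; get_next_page returns nil at the end.
    parser = WikiParser.new(:path => 'enwiki-articles.xml.bz2')
    while page = parser.get_next_page
      puts "#{page.title}: #{page.internal_links.length} internal links"
    end
    parser.close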
data/lib/wikiParserPage.rb
ADDED
@@ -0,0 +1,98 @@
+class WikiParser
+
+  # A Wikipedia article page object.
+  class Page
+
+    # The Wikipedia namespaces for all special pages {WikiParser::Page#special_page}, {#page_type}.
+    Namespaces = %w(WP Help Talk User Template Wikipedia File Book Portal TimedText Module MediaWiki Special Media Category)
+    # Title of the Wikipedia article.
+    attr_reader :title
+    # The Wikipedia id of the article.
+    attr_reader :id
+    # The internal links found in the article.
+    attr_reader :internal_links
+    # The content of the Wikipedia article.
+    attr_reader :article
+    # Is this page a redirection page?
+    attr_reader :redirect
+    # The title of the page this article redirects to.
+    attr_reader :redirect_title
+    # The Wikipedia namespace for this page.
+    attr_reader :page_type
+    # Is this page `special`? Is it in the {Namespaces}?
+    attr_reader :special_page
+    # Is this a disambiguation page?
+    attr_reader :disambiguation_page
+
+    # Create a new article page from an XML node.
+    # @param opts [Hash] the parameters to instantiate a page.
+    # @option opts [Nokogiri::XML::Node] :node the {http://rubydoc.info/gems/nokogiri/frames Nokogiri::XML::Node} containing the article.
+    # @option opts [Fixnum] :from the index from which to resume parsing among the nodes.
+    # @option opts [String] :until A node-name stopping point for the parsing.
+    def initialize(opts = {})
+      @title = @article = @redirect_title = ""
+      @redirect = @special_page = @disambiguation_page = false
+      @internal_links, @page_type = [], nil
+      return if opts[:node].nil?
+      @node = opts[:node] # keep the node around so {#finish_processing} can resume later
+      process_node opts
+      @internal_links = article_to_internal_links(@article)
+    end
+
+    def process_node(opts = {})
+      opts[:node].element_children.each_with_index do |node, k|
+        next if opts[:from] and k < opts[:from]
+        case node.name
+        when 'id'
+          @id = node.content
+        when 'title'
+          @title = node.content
+          if @title.match(/(#{Namespaces.join("|")}):.+/i)
+            @special_page = true
+            @page_type = $1
+          end
+          @disambiguation_page = true if @title.match(/.+ \(disambiguation\)/i)
+        when 'redirect'
+          @redirect = true
+          @redirect_title = node["title"]
+        when 'revision'
+          node.element_children.each do |rev_node|
+            @article = rev_node.content if rev_node.name == 'text'
+          end
+        end
+        if opts[:until] and opts[:until] == node.name
+          @stop_index = k
+          break
+        end
+      end
+    end
+
+    # Resumes processing the article's XML nodes from the stopping point given
+    # to the parser earlier, then extracts the internal links.
+    # @return [WikiParser::Page] the page itself.
+    def finish_processing
+      @stop_index ||= 0
+      process_node :node => @node, :from => @stop_index
+      # Recompute the links now that the full article text is available.
+      @internal_links = article_to_internal_links(@article)
+      self
+    end
+
+    # Extracts internal links from a Wikipedia article into an array of `uri`s and `title`s.
+    # @param article [String] the article content to extract links from.
+    # @return [Array<Hash>] the internal links in hash form.
+    def article_to_internal_links(article)
+      links = []
+      article.scan(/\[\[(?<name>[^\]\|:]+)(?<trigger>\|[^\]]+)?\]\]/).each do |match|
+        name_match = match[0].strip.chomp.match(/^(?<name>[^#]+)(?<hashtag>#.+)?/)
+        link_match = match[1] ? match[1].strip.chomp.match(/^\|[\t\n\s\/]*(?<name>[^#]+)(?<hashtag>#.+)?/) : name_match
+        if name_match
+          uri = name_match[:name].gsub('_', ' ')
+          links << {:uri => uri, :title => link_match ? link_match[:name] : uri}
+        end
+      end
+      links
+    end
+
+    private :process_node
+  end
+end
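The :until option and finish_processing combine into a two-phase pattern: stop parsing at a cheap-to-check node, inspect it, and only finish the expensive work when needed. A minimal sketch, reusing the hypothetical dump path from the earlier example:

    parser = WikiParser.new(:path => 'enwiki-articles.xml.bz2')

    # Phase 1: stop at the <title> node, before the revision text is parsed.
    page = parser.get_next_page(:until => 'title')

    # Phase 2: finish parsing only for ordinary articles
    # (special_page and disambiguation_page are derived from the title alone).
    if page and !page.special_page and !page.disambiguation_page
      page.finish_processing
      page.internal_links.each { |link| puts "#{link[:title]} -> #{link[:uri]}" }
    end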
data/lib/wikipedia_parser.rb
ADDED
@@ -0,0 +1 @@
+require(File.dirname(__FILE__)+'/wikiParser.rb')
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wikipedia_parser
 version: !ruby/object:Gem::Version
-  version: 1.0.3
+  version: 1.0.5
 platform: ruby
 authors:
 - Jonathan Raiman
@@ -58,7 +58,10 @@ email: jraiman@mit.edu
 executables: []
 extensions: []
 extra_rdoc_files: []
-files:
+files:
+- lib/wikiParser.rb
+- lib/wikiParserPage.rb
+- lib/wikipedia_parser.rb
 homepage: http://github.org/JonathanRaiman/wikipedia_parser
 licenses:
 - MIT
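The new files: entries in the metadata are what make the lib/ sources ship inside the packaged gem at all. A minimal sketch of where such a list originates in a gemspec; every field besides files here is an abbreviated assumption, not the gem's actual gemspec:

    Gem::Specification.new do |s|
      s.name    = 'wikipedia_parser'
      s.version = '1.0.5'
      s.summary = 'Parses Wikipedia dumps'  # placeholder summary
      s.authors = ['Jonathan Raiman']
      # Without this list, RubyGems packages no source files.
      s.files   = ['lib/wikiParser.rb', 'lib/wikiParserPage.rb', 'lib/wikipedia_parser.rb']
    end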