wikipedia_parser 1.0.3 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/lib/wikiParser.rb +68 -0
- data/lib/wikiParserPage.rb +98 -0
- data/lib/wikipedia_parser.rb +1 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-
+    YzAxOTllY2I0ZmY0NTI1ZjFiMDUzN2EzNmJiMzVmYzljZDE1MDY3Nw==
   data.tar.gz: !binary |-
-
+    MTFiOTI2NzNiOWI0MDEyOTkwODllZTQwYmNhYjAzN2Q0YWJhZWIxMw==
 SHA512:
   metadata.gz: !binary |-
-
-
-
+    ODI1YjBmMGIwOTYyZDlmNDYwMmNjMTNjOTgyZGMyMTY5ZTdhM2ZmMGEyZTY4
+    NjU1YTVkNDU4OTRkOGMzMzk1ZDA5NmFjNTgwYjIzNmZiMmY0Y2Y1NDk5YWU1
+    YzVkZjllZGU0NDE1MDU0NGI3NGJkZjI5MTU5ZWYyOGNmZWQzZjc=
   data.tar.gz: !binary |-
-
-
-
+    Y2M5YzYwNGNhODA4MWY5YzdjMDc4Y2FjODJiMTgyYjMxZDAwYTE0NGU1NTlm
+    Y2IyMTkyMzQ2NDYyNDllNzY0ZmYxOWE3ODRhMzk3M2M1ZmRmOTMxZGFkY2Zh
+    YjdjYzM3ZmI3NWI2MDU2NDdlZjE4YTVlZGM2ZGEyMTM5ZTFlMjM=
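Every entry in this file is doubly encoded: the !binary "U0hBMQ==" key is Base64 for the algorithm name, and each digest value is Base64 wrapped around the hex digest of the named archive inside the gem. A quick sketch with Ruby's standard library, decoding the new metadata.gz SHA1 shown above:

    require 'base64'

    # The YAML key decodes to the checksum algorithm name.
    Base64.decode64("U0hBMQ==")   # => "SHA1"

    # Each value wraps a plain hex digest; this is the new metadata.gz SHA1.
    Base64.decode64("YzAxOTllY2I0ZmY0NTI1ZjFiMDUzN2EzNmJiMzVmYzljZDE1MDY3Nw==")
    # => "c0199ecb4ff4525f1b0537a36bb35fc9cd150677"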
data/lib/wikiParser.rb
ADDED
@@ -0,0 +1,68 @@
+# coding: utf-8
+require 'nokogiri'
+require 'fileutils'
+require(File.dirname(__FILE__)+'/wikiParserPage.rb')
+
+# Parses a Wikipedia dump and extracts internal links, content, and page type.
+class WikiParser
+
+  # Path to the Wikipedia dump.
+  attr_reader :path
+
+  # Convert the opened path to a dump to an enumerator of {WikiParser::Page}
+  # @return [Enumerator<Nokogiri::XML::Node>] the enumerator.
+  def parse
+    @xml_file = File.open(@path)
+    @file = Nokogiri::XML::Reader((@path.match(/.+\.bz2/) ? (require 'bzip2'; Bzip2::Reader.open(@path)) : @xml_file), nil, 'utf-8', Nokogiri::XML::ParseOptions::NOERROR)
+    @reader = @file.to_enum
+  end
+
+  # Convert the opened path to a dump to an enumerator of {WikiParser::Page}
+  # @param opts [Hash] the parameters to parse a wikipedia page.
+  # @option opts [String] :path The path to the Wikipedia dump in .xml or .bz2 format.
+  # @return [Enumerator<Nokogiri::XML::Node>] the enumerator.
+  def initialize (opts = {})
+    @file, new_path = nil, opts[:path]
+    if File.exists? new_path and !File.directory? new_path
+      @path = new_path
+      parse
+    else
+      raise ArgumentError.new "Cannot open file. Check path please."
+    end
+  end
+
+  # Closes the file reader.
+  def close; @xml_file.close if @xml_file; end
+
+  # Skips a {WikiParser::Page} in the enumeration
+  def skip
+    begin
+      node = @reader.next
+      if node.name == "page" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
+      else
+        skip
+      end
+    rescue StopIteration
+      nil
+    end
+  end
+
+  # Reads the next node in the xml tree and returns it as a {WikiParser::Page} if it exists.
+  # @return [WikiParser::Page, NilClass] A page if found.
+  # @param opts [Hash] the parameters to instantiate a page.
+  # @option opts [String] :until A node-name stopping point for the parsing. (Useful for not parsing an entire page until some property is checked.)
+  # @see Page#finish_processing
+  def get_next_page(opts={})
+    begin
+      node = @reader.next
+      if node.name == "page" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
+        xml = Nokogiri::XML::parse("<page>"+node.inner_xml+"</page>").first_element_child
+        return WikiParser::Page.new({:node => xml}.merge(opts))
+      else
+        get_next_page(opts)
+      end
+    rescue StopIteration, NoMethodError
+      nil
+    end
+  end
+end
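The file above is the gem's whole streaming API: initialize validates the path, parse wraps the (optionally bzip2-compressed) dump in a Nokogiri::XML::Reader enumerator, and get_next_page/skip walk the <page> elements one at a time. A minimal usage sketch; 'dump.xml' is a hypothetical path to a local MediaWiki XML dump:

    require 'wikipedia_parser'

    # 'dump.xml' is a placeholder; a .bz2 path would be routed
    # through the bzip2 gem by the ternary inside #parse.
    parser = WikiParser.new :path => 'dump.xml'
    while page = parser.get_next_page
      next if page.special_page or page.redirect  # plain articles only
      puts "#{page.title}: #{page.internal_links.length} internal links"
    end
    parser.close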
data/lib/wikiParserPage.rb
ADDED
@@ -0,0 +1,98 @@
+class WikiParser
+
+  # A Wikipedia article page object.
+  class Page
+
+    # The Wikipedia namespaces for all special pages {WikiParser::Page#special_page}, {#page_type}.
+    Namespaces = %w(WP Help Talk User Template Wikipedia File Book Portal TimedText Module MediaWiki Special Media Category)
+    # Title of the Wikipedia article.
+    attr_reader :title
+    # The Wikipedia id of the article.
+    attr_reader :id
+    attr_reader :internal_links
+    # The content of the Wikipedia article.
+    attr_reader :article
+    # Is this page a redirection page?
+    attr_reader :redirect
+    # The title of the page this article redirects to.
+    attr_reader :redirect_title
+    # The Wikipedia namespace for this page.
+    attr_reader :page_type
+    # Is this page `special`? Is it in the {Namespaces}?
+    attr_reader :special_page
+    attr_reader :disambiguation_page
+
+    # Create a new article page from an XML node.
+    # @param opts [Hash] the parameters to instantiate a page.
+    # @option opts [Nokogiri::XML::Node] :node the {http://rubydoc.info/gems/nokogiri/frames Nokogiri::XML::Node} containing the article.
+    # @option opts [Fixnum] :from the index from which to resume parsing among the nodes.
+    # @option opts [String] :until A node-name stopping point for the parsing.
+    def initialize (opts={})
+      @title = @article = @redirect_title = ""
+      @redirect = @special_page = @disambiguation_page = false
+      @internal_links, @page_type = [], nil
+      return unless !opts[:node].nil?
+      process_node opts
+      trigs = article_to_internal_links(@article)
+      @internal_links = trigs
+    end
+
+    def process_node(opts={})
+      opts[:node].element_children.each_with_index do |node,k|
+        if opts[:from] and k < opts[:from] then next end
+        case node.name
+        when 'id'
+          @id = node.content
+        when 'title'
+          @title = node.content
+          if @title.match(/(#{Namespaces.join("|")}):.+/i) then @special_page = true and @page_type = $1 end
+          if @title.match(/.+ \(disambiguation\)/i) then @disambiguation_page = true end
+        when 'redirect'
+          @redirect = true
+          @redirect_title = node["title"]
+        when 'revision'
+          node.element_children.each do |rev_node|
+            if rev_node.name == 'text'
+              @article = rev_node.content
+            end
+          end
+        end
+        if opts[:until] and opts[:until] == node.name
+          @stop_index = k
+          break
+        end
+      end
+    end
+
+    # Extracts internal links from a Wikipedia article into an array of `uri`s and `title`s, starting
+    # from the stopping point given to the parser earlier.
+    # @return [WikiParser::Page] the page itself.
+    def finish_processing
+      @stop_index ||= 0
+      process_node :node => @node, :from => @stop_index
+      self
+    end
+
+    # Extracts internal links from a Wikipedia article into an array of `uri`s and `title`s:
+    # @param article [String] the article content to extract links from.
+    # @return [Array<Hash>] the internal links in hash form.
+    def article_to_internal_links article
+      links = []
+      matches = article.scan(/\[\[(?<name>[^\]\|:]+)(?<trigger>\|[^\]]+)?\]\]/)
+      if matches
+        matches.each do |match|
+          name_match = match[0].strip.chomp.match(/^(?<name>[^#]+)(?<hashtag>#.+)?/)
+          link_match = match[1] ? match[1].strip.chomp.match(/^\|[\t\n\s\/]*(?<name>[^#]+)(?<hashtag>#.+)?/) : name_match
+          if name_match
+            name_match = name_match[:name].gsub('_', ' ')
+            link_match = link_match ? link_match[:name] : name_match
+            links << {:uri => name_match, :title => link_match}
+          end
+        end
+      end
+      links
+    end
+
+    private :process_node
+  end
+end
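Most of the page's value comes from article_to_internal_links, whose regex captures the link target before an optional |label, rejects namespaced targets (the character class excludes ':'), and swaps underscores for spaces. It is a pure function of the wikitext, so it can be sketched without a dump; a Page built with no :node simply keeps its empty defaults. The wikitext string here is made up:

    require 'wikipedia_parser'

    page = WikiParser::Page.new   # no :node given, fields stay at defaults
    page.article_to_internal_links(
      "See [[Ruby_(programming_language)|Ruby]] and [[Nokogiri]], " +
      "but not [[File:logo.png]]."
    )
    # => [{:uri=>"Ruby (programming language)", :title=>"Ruby"},
    #     {:uri=>"Nokogiri", :title=>"Nokogiri"}]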
data/lib/wikipedia_parser.rb
ADDED
@@ -0,0 +1 @@
+require(File.dirname(__FILE__)+'/wikiParser.rb')
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wikipedia_parser
 version: !ruby/object:Gem::Version
-  version: 1.0.3
+  version: 1.0.5
 platform: ruby
 authors:
 - Jonathan Raiman
@@ -58,7 +58,10 @@ email: jraiman@mit.edu
 executables: []
 extensions: []
 extra_rdoc_files: []
-files:
+files:
+- lib/wikiParser.rb
+- lib/wikiParserPage.rb
+- lib/wikipedia_parser.rb
 homepage: http://github.org/JonathanRaiman/wikipedia_parser
 licenses:
 - MIT