wikipedia_parser 1.1.1 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/lib/wikiParser.rb +22 -3
- data/lib/wikiParserPage.rb +4 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MDg2OTllOGVkNTYyYWM3ZDI1NzhmMmUzZjQyZGQ2ODQyMzNmY2FmOQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
ZmEwOTlmMTg4NTU5OGIwMjY4ZDBkNGYxMjQ0YzYxOGQzMzFlZmJjOA==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ODRhMTk3YTg1ZjZjOTllMmM3NmNlOTcxMTgwZDNjNDg4YTY3YmI1MjA1YTk2
|
10
|
+
MGMxMmJiYzFjZTRmYTI4MzlkODI3ZDYzMzE5ODgzNTdiZDhmYWYzMGJiY2Vk
|
11
|
+
OTMzZWQyZGNiNmIxOTUwZDMyYTljMzM4NjNjMGEzM2MxMGQ4Yjc=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
NWU4YzNhNTUwNjVkMWE3YTU4Yjk1ZjNjN2I4YTRjMzljMGNiMjZlMWJlNGNi
|
14
|
+
NTZjOTVhZTc4NjUzNTQyOWY5NDczOGY0OTVjMDAxY2NlMThiYWU4ZGI0MDcy
|
15
|
+
ZjAxOTE1OTYzNWNjNjcxNzgxMDYyMjc4ZjU2NDEyOGFlMWM3ZDg=
|
data/lib/wikiParser.rb
CHANGED
@@ -5,13 +5,16 @@ require(File.dirname(__FILE__)+'/wikiParserPage.rb')
|
|
5
5
|
|
6
6
|
# Parses a Wikipedia dump and extracts internal links, content, and page type.
|
7
7
|
class WikiParser
|
8
|
+
LanguageNodePropertyName = "xml:lang"
|
8
9
|
|
9
10
|
# path to the Wikipedia dump.
|
10
11
|
attr_reader :path
|
12
|
+
# Language of the dump (e.g. "en", "fr", "ru", etc.)
|
13
|
+
attr_reader :language
|
11
14
|
|
12
15
|
# Opens the dump at {#path} and converts it to an enumerator over the dump's XML nodes.
|
13
16
|
# @return [Enumerator<Nokogiri::XML::Node>] the enumerator.
|
14
|
-
def
|
17
|
+
def prepare_enumerator
|
15
18
|
@xml_file = File.open(@path)
|
16
19
|
@file = Nokogiri::XML::Reader((@path.match(/.+\.bz2/) ? (require 'bzip2';Bzip2::Reader.open(@path)) : @xml_file), nil, 'utf-8', Nokogiri::XML::ParseOptions::NOERROR)
|
17
20
|
@reader = @file.to_enum
|
@@ -25,7 +28,8 @@ class WikiParser
|
|
25
28
|
@file, new_path = nil, opts[:path]
|
26
29
|
if File.exists? new_path and !File.directory? new_path
|
27
30
|
@path = new_path
|
28
|
-
|
31
|
+
prepare_enumerator
|
32
|
+
get_language
|
29
33
|
else
|
30
34
|
raise ArgumentError.new "Cannot open file. Check path please."
|
31
35
|
end
|
@@ -47,6 +51,21 @@ class WikiParser
|
|
47
51
|
end
|
48
52
|
end
|
49
53
|
|
54
|
+
# Obtains the language by reading the 'xml:lang' attribute of the dump's <mediawiki> element.
|
55
|
+
# @return [String] the language of the dump.
|
56
|
+
def get_language
|
57
|
+
begin
|
58
|
+
node = @reader.next
|
59
|
+
if node.name == "mediawiki" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
|
60
|
+
@language = node.attribute(LanguageNodePropertyName)
|
61
|
+
else
|
62
|
+
get_language
|
63
|
+
end
|
64
|
+
rescue StopIteration, NoMethodError
|
65
|
+
nil
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
50
69
|
# Reads the next node in the xml tree and returns it as a {WikiParser::Page} if it exists.
|
51
70
|
# @return [WikiParser::Page, NilClass] A page if found.
|
52
71
|
# @param opts [Hash] the parameters to instantiate a page.
|
@@ -57,7 +76,7 @@ class WikiParser
|
|
57
76
|
node = @reader.next
|
58
77
|
if node.name == "page" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
|
59
78
|
xml = Nokogiri::XML::parse("<page>"+node.inner_xml+"</page>").first_element_child
|
60
|
-
return WikiParser::Page.new({:node => xml}.merge(opts))
|
79
|
+
return WikiParser::Page.new({:node => xml, :language => @language}.merge(opts))
|
61
80
|
else
|
62
81
|
get_next_page(opts)
|
63
82
|
end
|
data/lib/wikiParserPage.rb
CHANGED
@@ -21,13 +21,16 @@ class WikiParser
|
|
21
21
|
# is this page `special`? Is it in the {Namespaces}?
|
22
22
|
attr_reader :special_page
|
23
23
|
attr_reader :disambiguation_page
|
24
|
+
attr_reader :language
|
24
25
|
|
25
26
|
# Create a new article page from an XML node.
|
26
27
|
# @param opts [Hash] the parameters to instantiate a page.
|
27
28
|
# @option opts [Nokogiri::XML::Node] :node the {http://rubydoc.info/gems/nokogiri/frames Nokogiri::XML::Node} containing the article.
|
28
29
|
# @option opts [Fixnum] :from the index from which to resume parsing among the nodes.
|
29
30
|
# @option opts [String] :until A node-name stopping point for the parsing.
|
31
|
+
# @option opts [String] :language The language of the dump this article was read from.
|
30
32
|
def initialize (opts={})
|
33
|
+
@language = opts[:language]
|
31
34
|
@title = @article = @redirect_title = ""
|
32
35
|
@redirect = @special_page = @disambiguation_page = false
|
33
36
|
@internal_links, @page_type = [], nil
|
@@ -89,7 +92,7 @@ class WikiParser
|
|
89
92
|
if name_match
|
90
93
|
name_match = name_match[:name].gsub('_', ' ')
|
91
94
|
link_match = link_match ? link_match[:name] : name_match
|
92
|
-
links << {:uri => name_match, :title => link_match}
|
95
|
+
links << {:uri => name_match, :title => {@language => link_match}}
|
93
96
|
end
|
94
97
|
end
|
95
98
|
end
|