wikipedia_parser 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/lib/wikiParser.rb +22 -3
- data/lib/wikiParserPage.rb +4 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MDg2OTllOGVkNTYyYWM3ZDI1NzhmMmUzZjQyZGQ2ODQyMzNmY2FmOQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
ZmEwOTlmMTg4NTU5OGIwMjY4ZDBkNGYxMjQ0YzYxOGQzMzFlZmJjOA==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ODRhMTk3YTg1ZjZjOTllMmM3NmNlOTcxMTgwZDNjNDg4YTY3YmI1MjA1YTk2
|
10
|
+
MGMxMmJiYzFjZTRmYTI4MzlkODI3ZDYzMzE5ODgzNTdiZDhmYWYzMGJiY2Vk
|
11
|
+
OTMzZWQyZGNiNmIxOTUwZDMyYTljMzM4NjNjMGEzM2MxMGQ4Yjc=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
NWU4YzNhNTUwNjVkMWE3YTU4Yjk1ZjNjN2I4YTRjMzljMGNiMjZlMWJlNGNi
|
14
|
+
NTZjOTVhZTc4NjUzNTQyOWY5NDczOGY0OTVjMDAxY2NlMThiYWU4ZGI0MDcy
|
15
|
+
ZjAxOTE1OTYzNWNjNjcxNzgxMDYyMjc4ZjU2NDEyOGFlMWM3ZDg=
|
data/lib/wikiParser.rb
CHANGED
@@ -5,13 +5,16 @@ require(File.dirname(__FILE__)+'/wikiParserPage.rb')
|
|
5
5
|
|
6
6
|
# Parses a Wikipedia dump and extracts internal links, content, and page type.
|
7
7
|
class WikiParser
|
8
|
+
LanguageNodePropertyName = "xml:lang"
|
8
9
|
|
9
10
|
# path to the Wikipedia dump.
|
10
11
|
attr_reader :path
|
12
|
+
# Language of the dump (e.g: "en","fr","ru",etc..)
|
13
|
+
attr_reader :language
|
11
14
|
|
12
15
|
# Convert the opened path to a dump to an enumerator of {WikiParser::Page}
|
13
16
|
# @return [Enumerator<Nokogiri::XML::Node>] the enumerator.
|
14
|
-
def
|
17
|
+
def prepare_enumerator
|
15
18
|
@xml_file = File.open(@path)
|
16
19
|
@file = Nokogiri::XML::Reader((@path.match(/.+\.bz2/) ? (require 'bzip2';Bzip2::Reader.open(@path)) : @xml_file), nil, 'utf-8', Nokogiri::XML::ParseOptions::NOERROR)
|
17
20
|
@reader = @file.to_enum
|
@@ -25,7 +28,8 @@ class WikiParser
|
|
25
28
|
@file, new_path = nil, opts[:path]
|
26
29
|
if File.exists? new_path and !File.directory? new_path
|
27
30
|
@path = new_path
|
28
|
-
|
31
|
+
prepare_enumerator
|
32
|
+
get_language
|
29
33
|
else
|
30
34
|
raise ArgumentError.new "Cannot open file. Check path please."
|
31
35
|
end
|
@@ -47,6 +51,21 @@ class WikiParser
|
|
47
51
|
end
|
48
52
|
end
|
49
53
|
|
54
|
+
# Obtains the language by reading the 'xml:lang' property in the xml of the dump.
|
55
|
+
# @return [String] the language of the dump.
|
56
|
+
def get_language
|
57
|
+
begin
|
58
|
+
node = @reader.next
|
59
|
+
if node.name == "mediawiki" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
|
60
|
+
@language = node.attribute(LanguageNodePropertyName)
|
61
|
+
else
|
62
|
+
get_language
|
63
|
+
end
|
64
|
+
rescue StopIteration, NoMethodError
|
65
|
+
nil
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
50
69
|
# Reads the next node in the xml tree and returns it as a {WikiParser#::Page} if it exists.
|
51
70
|
# @return [WikiParser::Page, NilClass] A page if found.
|
52
71
|
# @param opts [Hash] the parameters to instantiate a page.
|
@@ -57,7 +76,7 @@ class WikiParser
|
|
57
76
|
node = @reader.next
|
58
77
|
if node.name == "page" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
|
59
78
|
xml = Nokogiri::XML::parse("<page>"+node.inner_xml+"</page>").first_element_child
|
60
|
-
return WikiParser::Page.new({:node => xml}.merge(opts))
|
79
|
+
return WikiParser::Page.new({:node => xml, :language => @language}.merge(opts))
|
61
80
|
else
|
62
81
|
get_next_page(opts)
|
63
82
|
end
|
data/lib/wikiParserPage.rb
CHANGED
@@ -21,13 +21,16 @@ class WikiParser
|
|
21
21
|
# is this page `special`? Is it in the {Namespaces}?
|
22
22
|
attr_reader :special_page
|
23
23
|
attr_reader :disambiguation_page
|
24
|
+
attr_reader :language
|
24
25
|
|
25
26
|
# Create a new article page from an XML node.
|
26
27
|
# @param opts [Hash] the parameters to instantiate a page.
|
27
28
|
# @option opts [Nokogiri::XML::Node] :node the {http://rubydoc.info/gems/nokogiri/frames Nokogiri::XML::Node} containing the article.
|
28
29
|
# @option opts [Fixnum] :from the index from which to resume parsing among the nodes.
|
29
30
|
# @option opts [String] :until A node-name stopping point for the parsing.
|
31
|
+
# @option opts [String] :language The language of the dump this article was read from.
|
30
32
|
def initialize (opts={})
|
33
|
+
@language = opts[:language]
|
31
34
|
@title = @article = @redirect_title = ""
|
32
35
|
@redirect = @special_page = @disambiguation_page = false
|
33
36
|
@internal_links, @page_type = [], nil
|
@@ -89,7 +92,7 @@ class WikiParser
|
|
89
92
|
if name_match
|
90
93
|
name_match = name_match[:name].gsub('_', ' ')
|
91
94
|
link_match = link_match ? link_match[:name] : name_match
|
92
|
-
links << {:uri => name_match, :title => link_match}
|
95
|
+
links << {:uri => name_match, :title => {@language => link_match}}
|
93
96
|
end
|
94
97
|
end
|
95
98
|
end
|