RubyGems - wikipedia_parser - Versions diffs - 1.1.1 → 1.2.0 - Mend

wikipedia_parser 1.1.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

checksums.yaml CHANGED Viewed

@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-    ZTQyNDgzOGQ4M2ZmYzljMjZhYWZmZjNlMmU2NmUyNGViNDk2MzYzYQ==
+    MDg2OTllOGVkNTYyYWM3ZDI1NzhmMmUzZjQyZGQ2ODQyMzNmY2FmOQ==
   data.tar.gz: !binary |-
-    ZTFmMmE5MmJjNDA0Y2YyN2RlOGY1Y2E3OTJmMGRmNDRhY2YxZDZiMA==
+    ZmEwOTlmMTg4NTU5OGIwMjY4ZDBkNGYxMjQ0YzYxOGQzMzFlZmJjOA==
 SHA512:
   metadata.gz: !binary |-
-    MGQwZWM1ZGFiMjA4ZDVhNzUzMmJlMDVhNDRjY2EyYTBjOGExNTQ4Y2ZlNmYx
-    NTJjMTQ0ODU0YWQwNmQ4YWQ1YmZiMGYzYTM4YTAxMDZiNTVmNjk2ZGU3YWZj
-    OThhZjgwNDI4YjJiOTJhODY4OTRlMTAzMTA0OWRjYTBmNWFkYjg=
+    ODRhMTk3YTg1ZjZjOTllMmM3NmNlOTcxMTgwZDNjNDg4YTY3YmI1MjA1YTk2
+    MGMxMmJiYzFjZTRmYTI4MzlkODI3ZDYzMzE5ODgzNTdiZDhmYWYzMGJiY2Vk
+    OTMzZWQyZGNiNmIxOTUwZDMyYTljMzM4NjNjMGEzM2MxMGQ4Yjc=
   data.tar.gz: !binary |-
-    NTFlM2M5MjJiNzRhMjdkYzUyMThiNzdmMzg3YzJjMWU2MTliNDM3MzlkMDk1
-    MzA1MjMwNjRmNzI3ZDk2MjcxMWI2ZDc0MGM1NzhlYmRjM2I5NTBkMTcyMzM0
-    NDk4ZTA1Njg1NzgwZDg5YTUxMTg3MzdiMjk3ZjAxYmVkZDcyYzE=
+    NWU4YzNhNTUwNjVkMWE3YTU4Yjk1ZjNjN2I4YTRjMzljMGNiMjZlMWJlNGNi
+    NTZjOTVhZTc4NjUzNTQyOWY5NDczOGY0OTVjMDAxY2NlMThiYWU4ZGI0MDcy
+    ZjAxOTE1OTYzNWNjNjcxNzgxMDYyMjc4ZjU2NDEyOGFlMWM3ZDg=

data/lib/wikiParser.rb CHANGED Viewed

@@ -5,13 +5,16 @@ require(File.dirname(__FILE__)+'/wikiParserPage.rb')
 # Parses a Wikipedia dump and extracts internal links, content, and page type.
 class WikiParser
+	LanguageNodePropertyName = "xml:lang"
 	# path to the Wikipedia dump.
 	attr_reader :path
+	# Language of the dump (e.g: "en","fr","ru",etc..)
+	attr_reader :language
 	# Convert the opened path to a dump to an enumerator of {WikiParser::Page}
 	# @return [Enumerator<Nokogiri::XML::Node>] the enumerator.
-	def parse
+	def prepare_enumerator
 		@xml_file = File.open(@path)
 		@file = Nokogiri::XML::Reader((@path.match(/.+\.bz2/) ? (require 'bzip2';Bzip2::Reader.open(@path)) : @xml_file), nil, 'utf-8', Nokogiri::XML::ParseOptions::NOERROR)
 		@reader = @file.to_enum
@@ -25,7 +28,8 @@ class WikiParser
 		@file, new_path = nil, opts[:path]
 		if File.exists? new_path and !File.directory? new_path
 			@path = new_path
-			parse
+			prepare_enumerator
+			get_language
 		else
 			raise ArgumentError.new "Cannot open file. Check path please."
 		end
@@ -47,6 +51,21 @@ class WikiParser
 		end
 	end
+	# Obtains the language by reading the 'xml:lang' property in the xml of the dump.
+	# @return [String] the language of the dump.
+	def get_language
+		begin
+			node = @reader.next
+			if node.name == "mediawiki" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
+				@language = node.attribute(LanguageNodePropertyName)
+			else
+				get_language
+			end
+		rescue StopIteration, NoMethodError
+			nil
+		end
+	end
 	# Reads the next node in the xml tree and returns it as a {WikiParser#::Page} if it exists.
 	# @return [WikiParser::Page, NilClass] A page if found.
 	# @param opts [Hash] the parameters to instantiate a page.
@@ -57,7 +76,7 @@ class WikiParser
 			node = @reader.next
 			if node.name == "page" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
 				xml = Nokogiri::XML::parse("<page>"+node.inner_xml+"</page>").first_element_child
-				return WikiParser::Page.new({:node => xml}.merge(opts))
+				return WikiParser::Page.new({:node => xml, :language => @language}.merge(opts))
 			else
 				get_next_page(opts)
 			end

data/lib/wikiParserPage.rb CHANGED Viewed

@@ -21,13 +21,16 @@ class WikiParser
 		# is this page `special`? Is it in the {Namespaces}?
 		attr_reader :special_page
 		attr_reader :disambiguation_page
+		attr_reader :language
 		# Create a new article page from an XML node.
 		# @param opts [Hash] the parameters to instantiate a page.
 		# @option opts [Nokogiri::XML::Node] :node the {http://rubydoc.info/gems/nokogiri/frames Nokogiri::XML::Node} containing the article.
 		# @option opts [Fixnum] :from the index from which to resume parsing among the nodes.
 		# @option opts [String] :until A node-name stopping point for the parsing.
+		# @option opts [String] :language The language of the dump this article was read from.
 		def initialize (opts={})
+			@language = opts[:language]
 			@title    = @article      = @redirect_title      = ""
 			@redirect = @special_page = @disambiguation_page = false
 			@internal_links, @page_type = [], nil
@@ -89,7 +92,7 @@ class WikiParser
 					if name_match
 						name_match = name_match[:name].gsub('_', ' ')
 						link_match = link_match ? link_match[:name] : name_match
-						links << {:uri => name_match, :title => link_match}
+						links << {:uri => name_match, :title => {@language => link_match}}
 					end
 				end
 			end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wikipedia_parser
 version: !ruby/object:Gem::Version
-  version: 1.1.1
+  version: 1.2.0
 platform: ruby
 authors:
 - Jonathan Raiman