wikipedia_parser 1.1.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- ZTQyNDgzOGQ4M2ZmYzljMjZhYWZmZjNlMmU2NmUyNGViNDk2MzYzYQ==
4
+ MDg2OTllOGVkNTYyYWM3ZDI1NzhmMmUzZjQyZGQ2ODQyMzNmY2FmOQ==
5
5
  data.tar.gz: !binary |-
6
- ZTFmMmE5MmJjNDA0Y2YyN2RlOGY1Y2E3OTJmMGRmNDRhY2YxZDZiMA==
6
+ ZmEwOTlmMTg4NTU5OGIwMjY4ZDBkNGYxMjQ0YzYxOGQzMzFlZmJjOA==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- MGQwZWM1ZGFiMjA4ZDVhNzUzMmJlMDVhNDRjY2EyYTBjOGExNTQ4Y2ZlNmYx
10
- NTJjMTQ0ODU0YWQwNmQ4YWQ1YmZiMGYzYTM4YTAxMDZiNTVmNjk2ZGU3YWZj
11
- OThhZjgwNDI4YjJiOTJhODY4OTRlMTAzMTA0OWRjYTBmNWFkYjg=
9
+ ODRhMTk3YTg1ZjZjOTllMmM3NmNlOTcxMTgwZDNjNDg4YTY3YmI1MjA1YTk2
10
+ MGMxMmJiYzFjZTRmYTI4MzlkODI3ZDYzMzE5ODgzNTdiZDhmYWYzMGJiY2Vk
11
+ OTMzZWQyZGNiNmIxOTUwZDMyYTljMzM4NjNjMGEzM2MxMGQ4Yjc=
12
12
  data.tar.gz: !binary |-
13
- NTFlM2M5MjJiNzRhMjdkYzUyMThiNzdmMzg3YzJjMWU2MTliNDM3MzlkMDk1
14
- MzA1MjMwNjRmNzI3ZDk2MjcxMWI2ZDc0MGM1NzhlYmRjM2I5NTBkMTcyMzM0
15
- NDk4ZTA1Njg1NzgwZDg5YTUxMTg3MzdiMjk3ZjAxYmVkZDcyYzE=
13
+ NWU4YzNhNTUwNjVkMWE3YTU4Yjk1ZjNjN2I4YTRjMzljMGNiMjZlMWJlNGNi
14
+ NTZjOTVhZTc4NjUzNTQyOWY5NDczOGY0OTVjMDAxY2NlMThiYWU4ZGI0MDcy
15
+ ZjAxOTE1OTYzNWNjNjcxNzgxMDYyMjc4ZjU2NDEyOGFlMWM3ZDg=
data/lib/wikiParser.rb CHANGED
@@ -5,13 +5,16 @@ require(File.dirname(__FILE__)+'/wikiParserPage.rb')
5
5
 
6
6
  # Parses a Wikipedia dump and extracts internal links, content, and page type.
7
7
  class WikiParser
8
+ LanguageNodePropertyName = "xml:lang"
8
9
 
9
10
  # path to the Wikipedia dump.
10
11
  attr_reader :path
12
+ # Language of the dump (e.g. "en", "fr", "ru", etc.).
13
+ attr_reader :language
11
14
 
12
15
  # Opens the dump at {#path} and prepares an enumerator over the nodes of its XML stream.
13
16
  # @return [Enumerator<Nokogiri::XML::Node>] the enumerator.
14
- def parse
17
+ def prepare_enumerator
15
18
  @xml_file = File.open(@path)
16
19
  @file = Nokogiri::XML::Reader((@path.match(/.+\.bz2/) ? (require 'bzip2';Bzip2::Reader.open(@path)) : @xml_file), nil, 'utf-8', Nokogiri::XML::ParseOptions::NOERROR)
17
20
  @reader = @file.to_enum
@@ -25,7 +28,8 @@ class WikiParser
25
28
  @file, new_path = nil, opts[:path]
26
29
  if File.exists? new_path and !File.directory? new_path
27
30
  @path = new_path
28
- parse
31
+ prepare_enumerator
32
+ get_language
29
33
  else
30
34
  raise ArgumentError.new "Cannot open file. Check path please."
31
35
  end
@@ -47,6 +51,21 @@ class WikiParser
47
51
  end
48
52
  end
49
53
 
54
+ # Obtains the language by reading the 'xml:lang' attribute of the dump's <mediawiki> element.
55
+ # @return [String, NilClass] the language of the dump, or nil if it could not be read.
56
+ def get_language
57
+ begin
58
+ node = @reader.next
59
+ if node.name == "mediawiki" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
60
+ @language = node.attribute(LanguageNodePropertyName)
61
+ else
62
+ get_language
63
+ end
64
+ rescue StopIteration, NoMethodError
65
+ nil
66
+ end
67
+ end
68
+
50
69
  # Reads the next node in the XML tree and returns it as a {WikiParser::Page} if it exists.
51
70
  # @return [WikiParser::Page, NilClass] A page if found.
52
71
  # @param opts [Hash] the parameters to instantiate a page.
@@ -57,7 +76,7 @@ class WikiParser
57
76
  node = @reader.next
58
77
  if node.name == "page" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
59
78
  xml = Nokogiri::XML::parse("<page>"+node.inner_xml+"</page>").first_element_child
60
- return WikiParser::Page.new({:node => xml}.merge(opts))
79
+ return WikiParser::Page.new({:node => xml, :language => @language}.merge(opts))
61
80
  else
62
81
  get_next_page(opts)
63
82
  end
@@ -21,13 +21,16 @@ class WikiParser
21
21
  # Is this page `special`? Is it in the {Namespaces}?
22
22
  attr_reader :special_page
23
23
  attr_reader :disambiguation_page
24
+ attr_reader :language
24
25
 
25
26
  # Create a new article page from an XML node.
26
27
  # @param opts [Hash] the parameters to instantiate a page.
27
28
  # @option opts [Nokogiri::XML::Node] :node the {http://rubydoc.info/gems/nokogiri/frames Nokogiri::XML::Node} containing the article.
28
29
  # @option opts [Fixnum] :from the index from which to resume parsing among the nodes.
29
30
  # @option opts [String] :until A node-name stopping point for the parsing.
31
+ # @option opts [String] :language The language of the dump this article was read from.
30
32
  def initialize (opts={})
33
+ @language = opts[:language]
31
34
  @title = @article = @redirect_title = ""
32
35
  @redirect = @special_page = @disambiguation_page = false
33
36
  @internal_links, @page_type = [], nil
@@ -89,7 +92,7 @@ class WikiParser
89
92
  if name_match
90
93
  name_match = name_match[:name].gsub('_', ' ')
91
94
  link_match = link_match ? link_match[:name] : name_match
92
- links << {:uri => name_match, :title => link_match}
95
+ links << {:uri => name_match, :title => {@language => link_match}}
93
96
  end
94
97
  end
95
98
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wikipedia_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Raiman