wikipedia_parser 1.1.1 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- ZTQyNDgzOGQ4M2ZmYzljMjZhYWZmZjNlMmU2NmUyNGViNDk2MzYzYQ==
4
+ MDg2OTllOGVkNTYyYWM3ZDI1NzhmMmUzZjQyZGQ2ODQyMzNmY2FmOQ==
5
5
  data.tar.gz: !binary |-
6
- ZTFmMmE5MmJjNDA0Y2YyN2RlOGY1Y2E3OTJmMGRmNDRhY2YxZDZiMA==
6
+ ZmEwOTlmMTg4NTU5OGIwMjY4ZDBkNGYxMjQ0YzYxOGQzMzFlZmJjOA==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- MGQwZWM1ZGFiMjA4ZDVhNzUzMmJlMDVhNDRjY2EyYTBjOGExNTQ4Y2ZlNmYx
10
- NTJjMTQ0ODU0YWQwNmQ4YWQ1YmZiMGYzYTM4YTAxMDZiNTVmNjk2ZGU3YWZj
11
- OThhZjgwNDI4YjJiOTJhODY4OTRlMTAzMTA0OWRjYTBmNWFkYjg=
9
+ ODRhMTk3YTg1ZjZjOTllMmM3NmNlOTcxMTgwZDNjNDg4YTY3YmI1MjA1YTk2
10
+ MGMxMmJiYzFjZTRmYTI4MzlkODI3ZDYzMzE5ODgzNTdiZDhmYWYzMGJiY2Vk
11
+ OTMzZWQyZGNiNmIxOTUwZDMyYTljMzM4NjNjMGEzM2MxMGQ4Yjc=
12
12
  data.tar.gz: !binary |-
13
- NTFlM2M5MjJiNzRhMjdkYzUyMThiNzdmMzg3YzJjMWU2MTliNDM3MzlkMDk1
14
- MzA1MjMwNjRmNzI3ZDk2MjcxMWI2ZDc0MGM1NzhlYmRjM2I5NTBkMTcyMzM0
15
- NDk4ZTA1Njg1NzgwZDg5YTUxMTg3MzdiMjk3ZjAxYmVkZDcyYzE=
13
+ NWU4YzNhNTUwNjVkMWE3YTU4Yjk1ZjNjN2I4YTRjMzljMGNiMjZlMWJlNGNi
14
+ NTZjOTVhZTc4NjUzNTQyOWY5NDczOGY0OTVjMDAxY2NlMThiYWU4ZGI0MDcy
15
+ ZjAxOTE1OTYzNWNjNjcxNzgxMDYyMjc4ZjU2NDEyOGFlMWM3ZDg=
data/lib/wikiParser.rb CHANGED
@@ -5,13 +5,16 @@ require(File.dirname(__FILE__)+'/wikiParserPage.rb')
5
5
 
6
6
  # Parses a Wikipedia dump and extracts internal links, content, and page type.
7
7
  class WikiParser
8
+ LanguageNodePropertyName = "xml:lang"
8
9
 
9
10
  # path to the Wikipedia dump.
10
11
  attr_reader :path
12
+ # Language of the dump (e.g. "en", "fr", "ru", etc.).
13
+ attr_reader :language
11
14
 
12
15
  # Opens the dump at the stored path and builds an enumerator over its XML nodes, from which {WikiParser::Page} objects are read.
13
16
  # @return [Enumerator<Nokogiri::XML::Node>] the enumerator.
14
- def parse
17
+ def prepare_enumerator
15
18
  @xml_file = File.open(@path)
16
19
  @file = Nokogiri::XML::Reader((@path.match(/.+\.bz2/) ? (require 'bzip2';Bzip2::Reader.open(@path)) : @xml_file), nil, 'utf-8', Nokogiri::XML::ParseOptions::NOERROR)
17
20
  @reader = @file.to_enum
@@ -25,7 +28,8 @@ class WikiParser
25
28
  @file, new_path = nil, opts[:path]
26
29
  if File.exists? new_path and !File.directory? new_path
27
30
  @path = new_path
28
- parse
31
+ prepare_enumerator
32
+ get_language
29
33
  else
30
34
  raise ArgumentError.new "Cannot open file. Check path please."
31
35
  end
@@ -47,6 +51,21 @@ class WikiParser
47
51
  end
48
52
  end
49
53
 
54
+ # Obtains the language by reading the 'xml:lang' property in the xml of the dump.
55
+ # @return [String] the language of the dump.
56
+ def get_language
57
+ begin
58
+ node = @reader.next
59
+ if node.name == "mediawiki" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
60
+ @language = node.attribute(LanguageNodePropertyName)
61
+ else
62
+ get_language
63
+ end
64
+ rescue StopIteration, NoMethodError
65
+ nil
66
+ end
67
+ end
68
+
50
69
  # Reads the next node in the XML tree and returns it as a {WikiParser::Page} if it exists.
51
70
  # @return [WikiParser::Page, NilClass] A page if found.
52
71
  # @param opts [Hash] the parameters to instantiate a page.
@@ -57,7 +76,7 @@ class WikiParser
57
76
  node = @reader.next
58
77
  if node.name == "page" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
59
78
  xml = Nokogiri::XML::parse("<page>"+node.inner_xml+"</page>").first_element_child
60
- return WikiParser::Page.new({:node => xml}.merge(opts))
79
+ return WikiParser::Page.new({:node => xml, :language => @language}.merge(opts))
61
80
  else
62
81
  get_next_page(opts)
63
82
  end
@@ -21,13 +21,16 @@ class WikiParser
21
21
  # is this page `special`? Is it in the {Namespaces}?
22
22
  attr_reader :special_page
23
23
  attr_reader :disambiguation_page
24
+ attr_reader :language
24
25
 
25
26
  # Create a new article page from an XML node.
26
27
  # @param opts [Hash] the parameters to instantiate a page.
27
28
  # @option opts [Nokogiri::XML::Node] :node the {http://rubydoc.info/gems/nokogiri/frames Nokogiri::XML::Node} containing the article.
28
29
  # @option opts [Fixnum] :from the index from which to resume parsing among the nodes.
29
30
  # @option opts [String] :until A node-name stopping point for the parsing.
31
+ # @option opts [String] :language The language of the dump this article was read from.
30
32
  def initialize (opts={})
33
+ @language = opts[:language]
31
34
  @title = @article = @redirect_title = ""
32
35
  @redirect = @special_page = @disambiguation_page = false
33
36
  @internal_links, @page_type = [], nil
@@ -89,7 +92,7 @@ class WikiParser
89
92
  if name_match
90
93
  name_match = name_match[:name].gsub('_', ' ')
91
94
  link_match = link_match ? link_match[:name] : name_match
92
- links << {:uri => name_match, :title => link_match}
95
+ links << {:uri => name_match, :title => {@language => link_match}}
93
96
  end
94
97
  end
95
98
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wikipedia_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Raiman