wikipedia_parser 1.0.3 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- ZmE5MjA2NGQ4NTg5Zjc2OGQ5NzYwNWRjZGU3ZTM5MjM4MDA0ZDI0ZQ==
4
+ YzAxOTllY2I0ZmY0NTI1ZjFiMDUzN2EzNmJiMzVmYzljZDE1MDY3Nw==
5
5
  data.tar.gz: !binary |-
6
- YjVjMTUzZjI5ZmQzNDU4YzM2ZTcwMmEzYzlhMmViOTkwNzg0ZTJlOQ==
6
+ MTFiOTI2NzNiOWI0MDEyOTkwODllZTQwYmNhYjAzN2Q0YWJhZWIxMw==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- N2JkYTkxMjc4MmEwN2VhMjU5ZTZlMjc4ZTJkNzdlMmJhYjUxN2ZiMzc4NTVj
10
- ZDczMWUxNTEwMWIyY2Q1NjVjYzZjMGEzYTY0MmE5NDE3MTVkNTU0OGY3NDJi
11
- OTViMzRkN2QzMWU2YmE0NzY2ZWM4M2I2NzFjYTQ3N2QzZjY3YWQ=
9
+ ODI1YjBmMGIwOTYyZDlmNDYwMmNjMTNjOTgyZGMyMTY5ZTdhM2ZmMGEyZTY4
10
+ NjU1YTVkNDU4OTRkOGMzMzk1ZDA5NmFjNTgwYjIzNmZiMmY0Y2Y1NDk5YWU1
11
+ YzVkZjllZGU0NDE1MDU0NGI3NGJkZjI5MTU5ZWYyOGNmZWQzZjc=
12
12
  data.tar.gz: !binary |-
13
- ZGM0NGY4Y2RkZGYzYTk1MGRlYjY4OTRjNzFiNGEzMDQwMmVkMjY0M2U1OTJh
14
- NDVjNTlhMDM1MDQwYzZhYzYyNTA2ZGI0NzE4MTJhOGUzZmMzODBmZGQ4YzVi
15
- MTc5MTk4MTE5OWQ4OTM3ODU5ZjZlYWRjMWIwZWQyMjMyNzYwMjA=
13
+ Y2M5YzYwNGNhODA4MWY5YzdjMDc4Y2FjODJiMTgyYjMxZDAwYTE0NGU1NTlm
14
+ Y2IyMTkyMzQ2NDYyNDllNzY0ZmYxOWE3ODRhMzk3M2M1ZmRmOTMxZGFkY2Zh
15
+ YjdjYzM3ZmI3NWI2MDU2NDdlZjE4YTVlZGM2ZGEyMTM5ZTFlMjM=
data/lib/wikiParser.rb ADDED
@@ -0,0 +1,68 @@
1
# coding: utf-8
require 'nokogiri'
require 'fileutils'
require(File.dirname(__FILE__)+'/wikiParserPage.rb')

# Parses a Wikipedia dump and extracts internal links, content, and page type.
class WikiParser

  # @return [String] path to the Wikipedia dump.
  attr_reader :path

  # Opens the dump at {#path} and wraps it in a streaming XML reader.
  # A path ending in .bz2 is decompressed transparently via the bzip2 gem
  # (required lazily so plain-XML users don't need it installed).
  # @return [Enumerator<Nokogiri::XML::Node>] the node enumerator.
  def parse
    @xml_file = File.open(@path)
    source =
      if @path.match(/.+\.bz2/)
        require 'bzip2'
        Bzip2::Reader.open(@path)
      else
        @xml_file
      end
    @file = Nokogiri::XML::Reader(source, nil, 'utf-8', Nokogiri::XML::ParseOptions::NOERROR)
    @reader = @file.to_enum
  end

  # Creates a parser over a Wikipedia dump and immediately opens it.
  # @param opts [Hash] the parameters to parse a wikipedia page.
  # @option opts [String] :path The path to the Wikipedia dump in .xml or .bz2 format.
  # @raise [ArgumentError] if the path is missing, does not exist, or is a directory.
  def initialize(opts = {})
    @file, new_path = nil, opts[:path]
    # NOTE: File.exists? is deprecated (removed in Ruby 3.2); File.exist? is
    # the supported predicate. Guarding on new_path first turns a nil :path
    # into the documented ArgumentError rather than a TypeError.
    if new_path && File.exist?(new_path) && !File.directory?(new_path)
      @path = new_path
      parse
    else
      raise ArgumentError, "Cannot open file. Check path please."
    end
  end

  # Closes the file reader.
  def close; @xml_file.close if @xml_file; end

  # Skips a {WikiParser::Page} in the enumeration.
  # Iterative on purpose: the original recursed once per XML node between
  # <page> elements, which can exhaust the stack on a large dump.
  # @return [NilClass]
  def skip
    loop do
      node = @reader.next
      break if node.name == "page" && node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
    end
    nil
  rescue StopIteration
    nil
  end

  # Reads forward to the next <page> element and returns it as a
  # {WikiParser::Page} if one exists.
  # @return [WikiParser::Page, NilClass] A page if found.
  # @param opts [Hash] the parameters to instantiate a page.
  # @option opts [String] :until A node-name stopping point for the parsing. (Useful for not parsing an entire page until some property is checked.)
  # @see Page#finish_processing
  def get_next_page(opts = {})
    loop do
      node = @reader.next
      next unless node.name == "page" && node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
      # Re-parse the page subtree as a standalone document so Page can walk
      # element children with the full DOM API.
      xml = Nokogiri::XML::parse("<page>" + node.inner_xml + "</page>").first_element_child
      return WikiParser::Page.new({:node => xml}.merge(opts))
    end
  rescue StopIteration, NoMethodError
    nil
  end
end
@@ -0,0 +1,98 @@
1
class WikiParser

  # A Wikipedia article page object.
  class Page

    # The Wikipedia namespaces for all special pages {WikiParser::Page#special_page}, {#page_type}.
    Namespaces = %w(WP Help Talk User Template Wikipedia File Book Portal TimedText Module MediaWiki Special Media Category).freeze

    # Title of the Wikipedia article.
    attr_reader :title
    # The Wikipedia id of the article.
    attr_reader :id
    # The internal wiki links extracted from the article body.
    attr_reader :internal_links
    # the content of the Wikipedia article
    attr_reader :article
    # is this page a redirection page?
    attr_reader :redirect
    # the title of the page this article redirects to.
    attr_reader :redirect_title
    # the wikipedia namespace for this page
    attr_reader :page_type
    # is this page `special`? Is it in the {Namespaces}?
    attr_reader :special_page
    # does the title mark this page as a disambiguation page?
    attr_reader :disambiguation_page

    # Create a new article page from an XML node.
    # @param opts [Hash] the parameters to instantiate a page.
    # @option opts [Nokogiri::XML::Node] :node the Nokogiri::XML::Node containing the article.
    # @option opts [Fixnum] :from the index from which to resume parsing among the nodes.
    # @option opts [String] :until A node-name stopping point for the parsing.
    def initialize(opts = {})
      @title = @article = @redirect_title = ""
      @redirect = @special_page = @disambiguation_page = false
      @internal_links, @page_type = [], nil
      # BUG FIX: remember the node for #finish_processing — previously @node
      # was never assigned, so resuming always processed nil and raised.
      @node = opts[:node]
      return if @node.nil?
      process_node opts
      @internal_links = article_to_internal_links(@article)
    end

    # Walks the page's element children, populating id, title, redirect and
    # article text. Honors :from (skip already-seen nodes) and :until
    # (stop early and record @stop_index for #finish_processing).
    def process_node(opts = {})
      opts[:node].element_children.each_with_index do |node, k|
        next if opts[:from] && k < opts[:from]
        case node.name
        when 'id'
          @id = node.content
        when 'title'
          @title = node.content
          if (ns = @title.match(/(#{Namespaces.join("|")}):.+/i))
            @special_page = true
            @page_type = ns[1]
          end
          @disambiguation_page = true if @title.match(/.+ \(disambiguation\)/i)
        when 'redirect'
          @redirect = true
          @redirect_title = node["title"]
        when 'revision'
          node.element_children.each do |rev_node|
            @article = rev_node.content if rev_node.name == 'text'
          end
        end
        if opts[:until] && opts[:until] == node.name
          @stop_index = k
          break
        end
      end
    end

    # Resumes parsing of the page's XML node from the earlier stopping
    # point given to the parser, then re-extracts internal links.
    # @return [WikiParser::Page] the page itself.
    def finish_processing
      @stop_index ||= 0
      process_node :node => @node, :from => @stop_index
      self
    end

    # Extracts internals links from a wikipedia article into an array of `uri`s and `title`s:
    # @param article [String] the article content to extract links from.
    # @return [Array<Hash>] the internal links in hash form.
    def article_to_internal_links article
      links = []
      # [[target]] or [[target|label]]; targets with a ":" (File:, Category:…)
      # deliberately do not match.
      matches = article.scan(/\[\[(?<name>[^\]\|:]+)(?<trigger>\|[^\]]+)?\]\]/)
      matches.each do |match|
        name_match = match[0].strip.chomp.match(/^(?<name>[^#]+)(?<hashtag>#.+)?/)
        link_match = match[1] ? match[1].strip.chomp.match(/^\|[\t\n\s\/]*(?<name>[^#]+)(?<hashtag>#.+)?/) : name_match
        next unless name_match
        # The uri gets underscores normalized to spaces; the display title is
        # the raw captured name (pipe label, or the un-normalized target).
        uri = name_match[:name].gsub('_', ' ')
        title = link_match ? link_match[:name] : uri
        links << {:uri => uri, :title => title}
      end
      links
    end

    private :process_node
  end
end
@@ -0,0 +1 @@
1
# Entry point for `require 'wikipedia_parser'`: loads the parser (which in
# turn loads WikiParser::Page). require_relative is the idiomatic,
# __FILE__-free way to load a sibling file.
require_relative 'wikiParser'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wikipedia_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.3
4
+ version: 1.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Raiman
@@ -58,7 +58,10 @@ email: jraiman@mit.edu
58
58
  executables: []
59
59
  extensions: []
60
60
  extra_rdoc_files: []
61
- files: []
61
+ files:
62
+ - lib/wikiParser.rb
63
+ - lib/wikiParserPage.rb
64
+ - lib/wikipedia_parser.rb
62
65
  homepage: http://github.org/JonathanRaiman/wikipedia_parser
63
66
  licenses:
64
67
  - MIT