wikipedia_parser 1.0.3 → 1.0.5

checksums.yaml CHANGED
@@ -1,15 +1,15 @@
  ---
  !binary "U0hBMQ==":
    metadata.gz: !binary |-
-     ZmE5MjA2NGQ4NTg5Zjc2OGQ5NzYwNWRjZGU3ZTM5MjM4MDA0ZDI0ZQ==
+     YzAxOTllY2I0ZmY0NTI1ZjFiMDUzN2EzNmJiMzVmYzljZDE1MDY3Nw==
    data.tar.gz: !binary |-
-     YjVjMTUzZjI5ZmQzNDU4YzM2ZTcwMmEzYzlhMmViOTkwNzg0ZTJlOQ==
+     MTFiOTI2NzNiOWI0MDEyOTkwODllZTQwYmNhYjAzN2Q0YWJhZWIxMw==
  SHA512:
    metadata.gz: !binary |-
-     N2JkYTkxMjc4MmEwN2VhMjU5ZTZlMjc4ZTJkNzdlMmJhYjUxN2ZiMzc4NTVj
-     ZDczMWUxNTEwMWIyY2Q1NjVjYzZjMGEzYTY0MmE5NDE3MTVkNTU0OGY3NDJi
-     OTViMzRkN2QzMWU2YmE0NzY2ZWM4M2I2NzFjYTQ3N2QzZjY3YWQ=
+     ODI1YjBmMGIwOTYyZDlmNDYwMmNjMTNjOTgyZGMyMTY5ZTdhM2ZmMGEyZTY4
+     NjU1YTVkNDU4OTRkOGMzMzk1ZDA5NmFjNTgwYjIzNmZiMmY0Y2Y1NDk5YWU1
+     YzVkZjllZGU0NDE1MDU0NGI3NGJkZjI5MTU5ZWYyOGNmZWQzZjc=
    data.tar.gz: !binary |-
-     ZGM0NGY4Y2RkZGYzYTk1MGRlYjY4OTRjNzFiNGEzMDQwMmVkMjY0M2U1OTJh
-     NDVjNTlhMDM1MDQwYzZhYzYyNTA2ZGI0NzE4MTJhOGUzZmMzODBmZGQ4YzVi
-     MTc5MTk4MTE5OWQ4OTM3ODU5ZjZlYWRjMWIwZWQyMjMyNzYwMjA=
+     Y2M5YzYwNGNhODA4MWY5YzdjMDc4Y2FjODJiMTgyYjMxZDAwYTE0NGU1NTlm
+     Y2IyMTkyMzQ2NDYyNDllNzY0ZmYxOWE3ODRhMzk3M2M1ZmRmOTMxZGFkY2Zh
+     YjdjYzM3ZmI3NWI2MDU2NDdlZjE4YTVlZGM2ZGEyMTM5ZTFlMjM=
data/lib/wikiParser.rb ADDED
@@ -0,0 +1,64 @@
+ # coding: utf-8
+ require 'nokogiri'
+ require 'fileutils'
+ require(File.dirname(__FILE__)+'/wikiParserPage.rb')
+
+ # Parses a Wikipedia dump and extracts internal links, content, and page type.
+ class WikiParser
+
+   # Path to the Wikipedia dump.
+   attr_reader :path
+
+   # Opens the dump at {#path} and wraps it in an enumerator of XML nodes,
+   # transparently decompressing .bz2 dumps via the bzip2 gem.
+   # @return [Enumerator<Nokogiri::XML::Node>] the enumerator.
+   def parse
+     @xml_file = File.open(@path)
+     input = @path.match(/.+\.bz2/) ? (require 'bzip2'; Bzip2::Reader.open(@path)) : @xml_file
+     @file = Nokogiri::XML::Reader(input, nil, 'utf-8', Nokogiri::XML::ParseOptions::NOERROR)
+     @reader = @file.to_enum
+   end
+
+   # Creates a parser around a Wikipedia dump and opens it for reading.
+   # @param opts [Hash] the parameters to parse a Wikipedia dump.
+   # @option opts [String] :path The path to the Wikipedia dump in .xml or .bz2 format.
+   def initialize(opts = {})
+     @file, new_path = nil, opts[:path]
+     if File.exist?(new_path) and !File.directory?(new_path)
+       @path = new_path
+       parse
+     else
+       raise ArgumentError.new "Cannot open file. Check path please."
+     end
+   end
+
+   # Closes the file reader.
+   def close; @xml_file.close if @xml_file; end
+
+   # Skips a {WikiParser::Page} in the enumeration.
+   def skip
+     node = @reader.next
+     # Recurse until a <page> element node has been consumed.
+     skip unless node.name == "page" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
+   rescue StopIteration
+     nil
+   end
+
+   # Reads the next node in the XML tree and returns it as a {WikiParser::Page} if one exists.
+   # @return [WikiParser::Page, NilClass] a page, if found.
+   # @param opts [Hash] the parameters to instantiate a page.
+   # @option opts [String] :until A node-name stopping point for the parsing.
+   #   (Useful for not parsing an entire page until some property is checked.)
+   # @see Page#finish_processing
+   def get_next_page(opts = {})
+     node = @reader.next
+     if node.name == "page" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
+       xml = Nokogiri::XML::parse("<page>" + node.inner_xml + "</page>").first_element_child
+       WikiParser::Page.new({:node => xml}.merge(opts))
+     else
+       get_next_page(opts)
+     end
+   rescue StopIteration, NoMethodError
+     nil
+   end
+ end
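
For orientation, here is a minimal usage sketch of the reader above, assuming a hypothetical local dump file en_wiki.xml (any .xml or .bz2 dump path would do; the file name is not part of this diff):

  require 'wikipedia_parser'

  # Open the dump; raises ArgumentError when the path is missing or a directory.
  parser = WikiParser.new :path => 'en_wiki.xml'

  # get_next_page returns nil once the enumerator is exhausted.
  while page = parser.get_next_page
    puts page.title
  end
  parser.close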
data/lib/wikiParserPage.rb ADDED
@@ -0,0 +1,100 @@
+ class WikiParser
+
+   # A Wikipedia article page object.
+   class Page
+
+     # The Wikipedia namespaces for all special pages. @see #special_page @see #page_type
+     Namespaces = %w(WP Help Talk User Template Wikipedia File Book Portal TimedText Module MediaWiki Special Media Category)
+     # Title of the Wikipedia article.
+     attr_reader :title
+     # The Wikipedia id of the article.
+     attr_reader :id
+     # The internal links found in the article.
+     attr_reader :internal_links
+     # The content of the Wikipedia article.
+     attr_reader :article
+     # Is this page a redirection page?
+     attr_reader :redirect
+     # The title of the page this article redirects to.
+     attr_reader :redirect_title
+     # The Wikipedia namespace of this page.
+     attr_reader :page_type
+     # Is this page `special`? Is its namespace in {Namespaces}?
+     attr_reader :special_page
+     # Is this a disambiguation page?
+     attr_reader :disambiguation_page
+
+     # Create a new article page from an XML node.
+     # @param opts [Hash] the parameters to instantiate a page.
+     # @option opts [Nokogiri::XML::Node] :node the {http://rubydoc.info/gems/nokogiri/frames Nokogiri::XML::Node} containing the article.
+     # @option opts [Fixnum] :from the index from which to resume parsing among the nodes.
+     # @option opts [String] :until A node-name stopping point for the parsing.
+     def initialize(opts = {})
+       @title = @article = @redirect_title = ""
+       @redirect = @special_page = @disambiguation_page = false
+       @internal_links, @page_type = [], nil
+       return if opts[:node].nil?
+       @node = opts[:node] # kept so #finish_processing can resume later
+       process_node opts
+       @internal_links = article_to_internal_links(@article)
+     end
+
+     # Walks the page's child nodes, filling in the title, id, redirect,
+     # namespace, and article fields.
+     def process_node(opts = {})
+       opts[:node].element_children.each_with_index do |node, k|
+         next if opts[:from] and k < opts[:from]
+         case node.name
+         when 'id'
+           @id = node.content
+         when 'title'
+           @title = node.content
+           if @title.match(/(#{Namespaces.join("|")}):.+/i)
+             @special_page, @page_type = true, $1
+           end
+           @disambiguation_page = true if @title.match(/.+ \(disambiguation\)/i)
+         when 'redirect'
+           @redirect = true
+           @redirect_title = node["title"]
+         when 'revision'
+           node.element_children.each do |rev_node|
+             @article = rev_node.content if rev_node.name == 'text'
+           end
+         end
+         if opts[:until] and opts[:until] == node.name
+           @stop_index = k
+           break
+         end
+       end
+     end
+
+     # Resumes parsing of the page's XML node from the stopping point given to
+     # the parser earlier, then extracts the internal links.
+     # @return [WikiParser::Page] the page itself.
+     def finish_processing
+       @stop_index ||= 0
+       process_node :node => @node, :from => @stop_index
+       @internal_links = article_to_internal_links(@article)
+       self
+     end
+
+     # Extracts internal links from a Wikipedia article into an array of `uri`s and `title`s:
+     # @param article [String] the article content to extract links from.
+     # @return [Array<Hash>] the internal links in hash form.
+     def article_to_internal_links(article)
+       links = []
+       # [[target]] or [[target|display text]]; colon targets (File:, Category:, ...) are skipped.
+       article.scan(/\[\[(?<name>[^\]\|:]+)(?<trigger>\|[^\]]+)?\]\]/).each do |match|
+         name_match = match[0].strip.chomp.match(/^(?<name>[^#]+)(?<hashtag>#.+)?/)
+         link_match = match[1] ? match[1].strip.chomp.match(/^\|[\t\n\s\/]*(?<name>[^#]+)(?<hashtag>#.+)?/) : name_match
+         if name_match
+           uri = name_match[:name].gsub('_', ' ')
+           links << {:uri => uri, :title => (link_match ? link_match[:name] : uri)}
+         end
+       end
+       links
+     end
+
+     private :process_node
+   end
+ end
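
To make the extraction rules above concrete, a small sketch with invented wikitext (note that colon-prefixed targets such as [[File:...]] deliberately fall outside the scan regex):

  # An empty page is enough to reach the public helper.
  page = WikiParser::Page.new
  page.article_to_internal_links "[[Jupiter]] and [[Saturn|the planet]] and [[File:Saturn.jpg]]"
  # => [{:uri => "Jupiter", :title => "Jupiter"},
  #     {:uri => "Saturn",  :title => "the planet"}]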
data/lib/wikipedia_parser.rb ADDED
@@ -0,0 +1 @@
+ require(File.dirname(__FILE__)+'/wikiParser.rb')
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: wikipedia_parser
  version: !ruby/object:Gem::Version
-   version: 1.0.3
+   version: 1.0.5
  platform: ruby
  authors:
  - Jonathan Raiman
@@ -58,7 +58,10 @@ email: jraiman@mit.edu
  executables: []
  extensions: []
  extra_rdoc_files: []
- files: []
+ files:
+ - lib/wikiParser.rb
+ - lib/wikiParserPage.rb
+ - lib/wikipedia_parser.rb
  homepage: http://github.org/JonathanRaiman/wikipedia_parser
  licenses:
  - MIT
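
The new files: entries are what actually ship the sources above with the gem. A sketch of the deferred-parsing workflow they enable, again with a hypothetical dump path (the :until option and finish_processing come from the code in this diff):

  require 'wikipedia_parser'

  parser = WikiParser.new :path => 'en_wiki.xml.bz2'
  # Stop parsing each page as soon as its <title> node has been read...
  page = parser.get_next_page :until => 'title'
  # ...and only pay for full parsing on pages that matter.
  page.finish_processing unless page.nil? or page.special_page
  parser.close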