wiki-api 0.1.0 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -13
- data/.rubocop.yml +24 -0
- data/.travis.yml +12 -0
- data/Gemfile +2 -0
- data/README.md +60 -62
- data/Rakefile +13 -1
- data/bin/console +8 -0
- data/lib/wiki/api/connect.rb +48 -38
- data/lib/wiki/api/page.rb +35 -42
- data/lib/wiki/api/page_block.rb +16 -17
- data/lib/wiki/api/page_headline.rb +51 -50
- data/lib/wiki/api/page_link.rb +13 -14
- data/lib/wiki/api/page_list_item.rb +10 -13
- data/lib/wiki/api/util.rb +18 -20
- data/lib/wiki/api/version.rb +3 -1
- data/lib/wiki/api.rb +9 -8
- data/test/test_helper.rb +4 -7
- data/test/unit/wiki_connect.rb +18 -25
- data/test/unit/wiki_page_offline.rb +144 -111
- data/wiki-api.gemspec +20 -17
- metadata +53 -34
data/lib/wiki/api/page_block.rb
CHANGED
@@ -1,27 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Wiki
|
2
4
|
module Api
|
3
|
-
|
4
5
|
# Collection of elements segmented per headline
|
5
6
|
class PageBlock
|
6
|
-
|
7
7
|
attr_accessor :elements, :parent
|
8
8
|
|
9
|
-
def initialize
|
10
|
-
self.parent = options[:parent] if options.include?
|
9
|
+
def initialize(options = {})
|
10
|
+
self.parent = options[:parent] if options.include?(:parent)
|
11
11
|
self.elements = []
|
12
12
|
end
|
13
13
|
|
14
|
-
def <<
|
14
|
+
def <<(value)
|
15
15
|
# value.first.previous.name
|
16
|
-
|
16
|
+
elements << value
|
17
17
|
end
|
18
18
|
|
19
19
|
def to_texts
|
20
20
|
texts = []
|
21
|
-
|
22
|
-
text = Wiki::Api::Util.element_to_text
|
21
|
+
elements.flatten.each do |element|
|
22
|
+
text = Wiki::Api::Util.element_to_text(element) if element.is_a?(Nokogiri::XML::Element)
|
23
23
|
next if text.nil?
|
24
24
|
next if text.empty?
|
25
|
+
|
25
26
|
texts << text
|
26
27
|
end
|
27
28
|
texts
|
@@ -29,27 +30,25 @@ module Wiki
|
|
29
30
|
|
30
31
|
def list_items
|
31
32
|
# TODO: perhaps we should wrap the elements with objects, and request a li per element??
|
32
|
-
|
33
|
-
PageListItem.new
|
33
|
+
search('li').map do |list_item|
|
34
|
+
PageListItem.new(parent: self, element: list_item)
|
34
35
|
end
|
35
36
|
end
|
36
37
|
|
37
38
|
def links
|
38
39
|
# TODO: perhaps we should wrap the elements with objects, and request a li per element??
|
39
|
-
|
40
|
-
PageLink.new
|
40
|
+
search('a').map do |a|
|
41
|
+
PageLink.new(parent: self, element: a)
|
41
42
|
end
|
42
43
|
end
|
43
44
|
|
44
45
|
protected
|
45
46
|
|
46
47
|
def search *paths
|
47
|
-
|
48
|
+
elements.flatten.flat_map do |element|
|
48
49
|
element.search(*paths)
|
49
|
-
end.reject
|
50
|
+
end.reject(&:nil?)
|
50
51
|
end
|
51
|
-
|
52
52
|
end
|
53
|
-
|
54
53
|
end
|
55
|
-
end
|
54
|
+
end
|
@@ -1,117 +1,118 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Wiki
|
2
4
|
module Api
|
3
|
-
|
4
5
|
# Headline for a page (class="mw-headline")
|
5
6
|
class PageHeadline
|
6
|
-
|
7
7
|
require 'json'
|
8
8
|
|
9
|
-
LEVEL = [
|
9
|
+
LEVEL = %w[text h1 h2 h3 h4 h5 h6].freeze
|
10
10
|
|
11
11
|
attr_accessor :name, :block, :parent, :headlines, :level
|
12
12
|
|
13
|
-
def initialize
|
14
|
-
self.name = options[:name] if options.include?
|
15
|
-
self.parent = options[:parent] if options.include?
|
16
|
-
self.level = options[:level] if options.include?
|
13
|
+
def initialize(options = {})
|
14
|
+
self.name = options[:name] if options.include?(:name)
|
15
|
+
self.parent = options[:parent] if options.include?(:parent)
|
16
|
+
self.level = options[:level] if options.include?(:level)
|
17
17
|
options[:headlines] ||= []
|
18
18
|
self.headlines ||= {}
|
19
19
|
|
20
20
|
# store elements in a block
|
21
|
-
self.block = PageBlock.new
|
22
|
-
if options[:headlines].include?
|
23
|
-
options[:headlines][
|
24
|
-
|
21
|
+
self.block = PageBlock.new(parent: self)
|
22
|
+
if options[:headlines].include?(name)
|
23
|
+
options[:headlines][name].each do |element|
|
24
|
+
block << element
|
25
25
|
end
|
26
26
|
end
|
27
27
|
|
28
28
|
# collect nested headlines
|
29
29
|
headlines = options[:headlines]
|
30
30
|
# remove self from list
|
31
|
-
headlines.delete
|
32
|
-
nested_headlines = self.nested_headlines
|
31
|
+
headlines.delete(name)
|
32
|
+
nested_headlines = self.nested_headlines(headlines, name, level)
|
33
33
|
|
34
34
|
# iterate nested headlines, and call recursive
|
35
35
|
nested_headlines.each do |headline_name, value|
|
36
|
-
level = LEVEL.index
|
37
|
-
self.headlines[headline_name] =
|
36
|
+
level = LEVEL.index(value.first.first.previous.name)
|
37
|
+
self.headlines[headline_name] =
|
38
|
+
PageHeadline.new(parent: self, name: headline_name, headlines:, level:)
|
38
39
|
end
|
39
40
|
end
|
40
41
|
|
41
42
|
def elements
|
42
|
-
|
43
|
+
block.elements
|
43
44
|
end
|
44
45
|
|
45
46
|
def type
|
46
|
-
|
47
|
+
block.elements.first.first.previous.name
|
47
48
|
end
|
48
49
|
|
49
50
|
# get headline by name
|
50
|
-
def headline
|
51
|
-
name = name.downcase.gsub(
|
52
|
-
self.headlines.
|
53
|
-
|
54
|
-
end.values
|
51
|
+
def headline(name)
|
52
|
+
name = name.downcase.gsub(' ', '_')
|
53
|
+
self.headlines.select do |k, _v|
|
54
|
+
k.downcase.start_with?(name)
|
55
|
+
end.values
|
55
56
|
end
|
56
57
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
58
|
+
def headline_in_depth(name, depth = 1)
|
59
|
+
name = name.downcase.gsub(' ', '_')
|
60
|
+
ret = []
|
61
|
+
|
62
|
+
self.headlines.each do |k, v|
|
63
|
+
ret << v if k.downcase.start_with?(name)
|
64
|
+
next if v.headlines.empty?
|
65
|
+
|
66
|
+
if depth.positive?
|
67
|
+
q = v.headline_in_depth(name, (depth - 1))
|
68
|
+
ret.concat(q)
|
69
|
+
end
|
70
|
+
end
|
71
|
+
ret
|
72
|
+
end
|
71
73
|
|
72
74
|
# headline exists for current headline
|
73
|
-
def has_headline?
|
74
|
-
name = name.downcase.gsub(
|
75
|
-
self.headlines.
|
75
|
+
def has_headline?(name)
|
76
|
+
name = name.downcase.gsub(' ', '_')
|
77
|
+
self.headlines.each_key do |k|
|
76
78
|
return true if k.downcase.start_with?(name)
|
77
79
|
end
|
78
80
|
false
|
79
81
|
end
|
80
82
|
|
81
83
|
def to_hash
|
82
|
-
ret = {
|
83
|
-
self.headlines.
|
84
|
+
ret = { name:, headlines: [], type: }
|
85
|
+
self.headlines.each_value do |headline|
|
84
86
|
ret[:headlines] << headline.to_hash
|
85
87
|
end
|
86
88
|
ret
|
87
89
|
end
|
88
90
|
|
89
91
|
def to_pretty_json
|
90
|
-
JSON.pretty_generate
|
92
|
+
JSON.pretty_generate(to_hash)
|
91
93
|
end
|
92
94
|
|
93
|
-
protected
|
95
|
+
protected
|
94
96
|
|
95
97
|
# filter nested headlines (elements) from a parent headline (by name)
|
96
|
-
def nested_headlines
|
98
|
+
def nested_headlines(headlines, _name, original_level)
|
97
99
|
ret = {}
|
98
100
|
init_level = nil
|
99
101
|
# iterate headlines, skip already done ones
|
100
|
-
#headlines.drop(headline_index + 1).each do |headline|
|
102
|
+
# headlines.drop(headline_index + 1).each do |headline|
|
101
103
|
headlines.to_a.each do |name, value|
|
102
|
-
level = LEVEL.index
|
103
|
-
init_level ||= level
|
104
|
+
level = LEVEL.index(value.first.first.previous.name)
|
105
|
+
init_level ||= level
|
104
106
|
# lower level indicate nest end
|
105
107
|
break if level <= original_level
|
106
108
|
break if level < init_level
|
107
109
|
# higher level indicates nested items, these will be processed recursive
|
108
110
|
next if init_level != level
|
111
|
+
|
109
112
|
ret[name] = value
|
110
113
|
end
|
111
114
|
ret
|
112
115
|
end
|
113
|
-
|
114
116
|
end
|
115
|
-
|
116
117
|
end
|
117
|
-
end
|
118
|
+
end
|
data/lib/wiki/api/page_link.rb
CHANGED
@@ -1,38 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Wiki
|
2
4
|
module Api
|
3
|
-
|
4
5
|
# Link on a wiki page (a href=xxx)
|
5
6
|
class PageLink
|
6
|
-
|
7
7
|
attr_accessor :element, :parent
|
8
8
|
|
9
|
-
def initialize
|
10
|
-
self.element = options[:element] if options.include?
|
11
|
-
self.parent = options[:parent] if options.include?
|
9
|
+
def initialize(options = {})
|
10
|
+
self.element = options[:element] if options.include?(:element)
|
11
|
+
self.parent = options[:parent] if options.include?(:parent)
|
12
12
|
end
|
13
13
|
|
14
14
|
def to_text
|
15
|
-
Wiki::Api::Util.element_to_text
|
15
|
+
Wiki::Api::Util.element_to_text(element)
|
16
16
|
end
|
17
17
|
|
18
18
|
def uri
|
19
19
|
# lookup the root parent, and get connector info
|
20
20
|
host = Wiki::Api::Util.parent_root(self).connect.uri
|
21
|
-
href_value =
|
22
|
-
URI.parse
|
21
|
+
href_value = element.attributes['href'].value
|
22
|
+
URI.parse("#{host}#{href_value}")
|
23
23
|
end
|
24
24
|
|
25
25
|
def title
|
26
26
|
# skip links with no title
|
27
|
-
return
|
28
|
-
|
27
|
+
return '' if element.attributes['title'].nil?
|
28
|
+
|
29
|
+
element.attributes['title'].value
|
29
30
|
end
|
30
31
|
|
31
32
|
def html
|
32
|
-
"<a href=\"#{
|
33
|
+
"<a href=\"#{uri}\" alt=\"#{title}\">#{title}</a>"
|
33
34
|
end
|
34
|
-
|
35
35
|
end
|
36
|
-
|
37
36
|
end
|
38
|
-
end
|
37
|
+
end
|
@@ -1,34 +1,31 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Wiki
|
2
4
|
module Api
|
3
|
-
|
4
5
|
# List Items on a Page (li=xxx)
|
5
6
|
class PageListItem
|
6
|
-
|
7
7
|
attr_accessor :element, :parent
|
8
8
|
|
9
|
-
def initialize
|
10
|
-
self.
|
11
|
-
self.
|
9
|
+
def initialize(options = {})
|
10
|
+
self.parent = options[:parent] if options.include?(:parent)
|
11
|
+
self.element = options[:element] if options.include?(:element)
|
12
12
|
end
|
13
13
|
|
14
14
|
def to_text
|
15
|
-
Wiki::Api::Util.element_to_text
|
15
|
+
Wiki::Api::Util.element_to_text(element)
|
16
16
|
end
|
17
17
|
|
18
18
|
def links
|
19
|
-
|
20
|
-
PageLink.new
|
19
|
+
search('a').map do |a|
|
20
|
+
PageLink.new(parent: self, element: a)
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
24
|
protected
|
25
25
|
|
26
26
|
def search *paths
|
27
|
-
|
27
|
+
element.search(*paths)
|
28
28
|
end
|
29
|
-
|
30
|
-
|
31
29
|
end
|
32
|
-
|
33
30
|
end
|
34
|
-
end
|
31
|
+
end
|
data/lib/wiki/api/util.rb
CHANGED
@@ -1,46 +1,44 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Wiki
|
2
4
|
module Api
|
3
|
-
|
4
5
|
class Util
|
5
|
-
|
6
6
|
class << self
|
7
|
+
def element_to_text(element)
|
8
|
+
raise('not an element') unless element.is_a?(Nokogiri::XML::Element)
|
7
9
|
|
8
|
-
|
9
|
-
raise "not an element" unless element.is_a? Nokogiri::XML::Element
|
10
|
-
self.clean_text element.text
|
10
|
+
clean_text(element.text)
|
11
11
|
end
|
12
12
|
|
13
|
-
def element_filter_lists
|
14
|
-
raise
|
13
|
+
def element_filter_lists(element)
|
14
|
+
raise('not an element') unless element.is_a?(Nokogiri::XML::Element)
|
15
|
+
|
15
16
|
result = {}
|
16
|
-
element.search(
|
17
|
+
element.search('li').each_with_index do |li, i|
|
17
18
|
li.children.each do |child|
|
18
19
|
result[i] ||= []
|
19
|
-
result[i] <<
|
20
|
+
result[i] << clean_text(child.text)
|
20
21
|
end
|
21
22
|
end
|
22
|
-
result.map{ |
|
23
|
+
result.map { |_k, v| v.join('') }
|
23
24
|
end
|
24
25
|
|
25
|
-
def parent_root
|
26
|
+
def parent_root(current_object)
|
26
27
|
current = current_object
|
27
|
-
|
28
|
+
loop do
|
28
29
|
break if current.parent.nil?
|
30
|
+
|
29
31
|
current = current.parent
|
30
32
|
end
|
31
33
|
current
|
32
34
|
end
|
33
35
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
36
|
protected
|
38
|
-
def clean_text text
|
39
|
-
text.gsub(/\n/, " ").squeeze(" ").gsub(/\s(\W)/, '\1').gsub(/(\W)\s/, '\1 ').strip
|
40
|
-
end
|
41
37
|
|
38
|
+
def clean_text(text)
|
39
|
+
text.gsub(/\n/, ' ').squeeze(' ').gsub(/\s(\W)/, '\1').gsub(/(\W)\s/, '\1 ').strip
|
40
|
+
end
|
42
41
|
end
|
43
|
-
|
44
42
|
end
|
45
43
|
end
|
46
|
-
end
|
44
|
+
end
|
data/lib/wiki/api/version.rb
CHANGED
data/lib/wiki/api.rb
CHANGED
@@ -1,12 +1,13 @@
|
|
1
|
-
|
2
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/connect")
|
3
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/page")
|
4
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/page_headline")
|
5
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/page_block")
|
6
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/page_list_item")
|
7
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/page_link")
|
8
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/util")
|
1
|
+
# frozen_string_literal: true
|
9
2
|
|
3
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/version")
|
4
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/connect")
|
5
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/page")
|
6
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/page_headline")
|
7
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/page_block")
|
8
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/page_list_item")
|
9
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/page_link")
|
10
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/util")
|
10
11
|
|
11
12
|
module Wiki
|
12
13
|
module Api
|
data/test/test_helper.rb
CHANGED
data/test/unit/wiki_connect.rb
CHANGED
@@ -1,51 +1,44 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'test_helper'
|
4
|
+
require 'pry'
|
4
5
|
|
5
6
|
#
|
6
7
|
# Testing the connection to https://www.mediawiki.org/wiki/API:Main_page
|
7
8
|
#
|
8
9
|
|
9
10
|
class WikiConnect < Test::Unit::TestCase
|
10
|
-
|
11
|
-
CONFIG = { uri: "http://en.wiktionary.org" }
|
11
|
+
CONFIG = { uri: 'https://en.wiktionary.org' }.freeze
|
12
12
|
|
13
13
|
def setup
|
14
14
|
Wiki::Api::Connect.config = CONFIG
|
15
15
|
end
|
16
16
|
|
17
|
-
def teardown
|
18
|
-
end
|
17
|
+
def teardown; end
|
19
18
|
|
20
19
|
def test_connection_wiktionary
|
21
|
-
c = Wiki::Api::Connect.new
|
20
|
+
c = Wiki::Api::Connect.new(uri: 'http://en.wiktionary.org')
|
22
21
|
ret = c.connect
|
23
|
-
assert
|
22
|
+
assert(ret.is_a?(Net::HTTPMovedPermanently), 'invalid response http')
|
24
23
|
end
|
25
24
|
|
26
25
|
def test_connection_https_wiktionary
|
27
|
-
c = Wiki::Api::Connect.new
|
26
|
+
c = Wiki::Api::Connect.new(uri: 'https://en.wiktionary.org')
|
28
27
|
ret = c.connect
|
29
|
-
assert
|
28
|
+
assert(ret.is_a?(Net::HTTPOK), 'invalid response https')
|
30
29
|
end
|
31
30
|
|
32
31
|
def test_page_get
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
assert false, "expected valid page #{e.message}"
|
38
|
-
end
|
32
|
+
c = Wiki::Api::Connect.new
|
33
|
+
c.page('Wiktionary:Welcome,_newcomers')
|
34
|
+
rescue Exception => e
|
35
|
+
assert(false, "expected valid page #{e.message}")
|
39
36
|
end
|
40
37
|
|
41
38
|
def test_page_get_non_exist
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
assert (e.message == "missingtitle"), "expected invalid page #{e.message}"
|
47
|
-
end
|
39
|
+
c = Wiki::Api::Connect.new
|
40
|
+
c.page('asfsldkfjjlkanv98yhok')
|
41
|
+
rescue Exception => e
|
42
|
+
assert((e.message == 'missingtitle'), "expected invalid page #{e.message}")
|
48
43
|
end
|
49
|
-
|
50
|
-
|
51
44
|
end
|