wiki-api 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -13
- data/.rubocop.yml +24 -0
- data/.travis.yml +12 -0
- data/Gemfile +2 -0
- data/README.md +60 -62
- data/Rakefile +13 -1
- data/bin/console +8 -0
- data/lib/wiki/api/connect.rb +48 -38
- data/lib/wiki/api/page.rb +35 -42
- data/lib/wiki/api/page_block.rb +16 -17
- data/lib/wiki/api/page_headline.rb +51 -50
- data/lib/wiki/api/page_link.rb +13 -14
- data/lib/wiki/api/page_list_item.rb +10 -13
- data/lib/wiki/api/util.rb +18 -20
- data/lib/wiki/api/version.rb +3 -1
- data/lib/wiki/api.rb +9 -8
- data/test/test_helper.rb +4 -7
- data/test/unit/wiki_connect.rb +18 -25
- data/test/unit/wiki_page_offline.rb +144 -111
- data/wiki-api.gemspec +20 -17
- metadata +53 -34
data/lib/wiki/api/page_block.rb
CHANGED
@@ -1,27 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Wiki
|
2
4
|
module Api
|
3
|
-
|
4
5
|
# Collection of elements for segmented per headline
|
5
6
|
class PageBlock
|
6
|
-
|
7
7
|
attr_accessor :elements, :parent
|
8
8
|
|
9
|
-
def initialize
|
10
|
-
self.parent = options[:parent] if options.include?
|
9
|
+
def initialize(options = {})
|
10
|
+
self.parent = options[:parent] if options.include?(:parent)
|
11
11
|
self.elements = []
|
12
12
|
end
|
13
13
|
|
14
|
-
def <<
|
14
|
+
def <<(value)
|
15
15
|
# value.first.previous.name
|
16
|
-
|
16
|
+
elements << value
|
17
17
|
end
|
18
18
|
|
19
19
|
def to_texts
|
20
20
|
texts = []
|
21
|
-
|
22
|
-
text = Wiki::Api::Util.element_to_text
|
21
|
+
elements.flatten.each do |element|
|
22
|
+
text = Wiki::Api::Util.element_to_text(element) if element.is_a?(Nokogiri::XML::Element)
|
23
23
|
next if text.nil?
|
24
24
|
next if text.empty?
|
25
|
+
|
25
26
|
texts << text
|
26
27
|
end
|
27
28
|
texts
|
@@ -29,27 +30,25 @@ module Wiki
|
|
29
30
|
|
30
31
|
def list_items
|
31
32
|
# TODO: perhaps we should wrap the elements with objects, and request a li per element??
|
32
|
-
|
33
|
-
PageListItem.new
|
33
|
+
search('li').map do |list_item|
|
34
|
+
PageListItem.new(parent: self, element: list_item)
|
34
35
|
end
|
35
36
|
end
|
36
37
|
|
37
38
|
def links
|
38
39
|
# TODO: perhaps we should wrap the elements with objects, and request a li per element??
|
39
|
-
|
40
|
-
PageLink.new
|
40
|
+
search('a').map do |a|
|
41
|
+
PageLink.new(parent: self, element: a)
|
41
42
|
end
|
42
43
|
end
|
43
44
|
|
44
45
|
protected
|
45
46
|
|
46
47
|
def search *paths
|
47
|
-
|
48
|
+
elements.flatten.flat_map do |element|
|
48
49
|
element.search(*paths)
|
49
|
-
end.reject
|
50
|
+
end.reject(&:nil?)
|
50
51
|
end
|
51
|
-
|
52
52
|
end
|
53
|
-
|
54
53
|
end
|
55
|
-
end
|
54
|
+
end
|
data/lib/wiki/api/page_headline.rb
CHANGED
@@ -1,117 +1,118 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Wiki
|
2
4
|
module Api
|
3
|
-
|
4
5
|
# Headline for a page (class="mw-healine")
|
5
6
|
class PageHeadline
|
6
|
-
|
7
7
|
require 'json'
|
8
8
|
|
9
|
-
LEVEL = [
|
9
|
+
LEVEL = %w[text h1 h2 h3 h4 h5 h6].freeze
|
10
10
|
|
11
11
|
attr_accessor :name, :block, :parent, :headlines, :level
|
12
12
|
|
13
|
-
def initialize
|
14
|
-
self.name = options[:name] if options.include?
|
15
|
-
self.parent = options[:parent] if options.include?
|
16
|
-
self.level = options[:level] if options.include?
|
13
|
+
def initialize(options = {})
|
14
|
+
self.name = options[:name] if options.include?(:name)
|
15
|
+
self.parent = options[:parent] if options.include?(:parent)
|
16
|
+
self.level = options[:level] if options.include?(:level)
|
17
17
|
options[:headlines] ||= []
|
18
18
|
self.headlines ||= {}
|
19
19
|
|
20
20
|
# store elements in a block
|
21
|
-
self.block = PageBlock.new
|
22
|
-
if options[:headlines].include?
|
23
|
-
options[:headlines][
|
24
|
-
|
21
|
+
self.block = PageBlock.new(parent: self)
|
22
|
+
if options[:headlines].include?(name)
|
23
|
+
options[:headlines][name].each do |element|
|
24
|
+
block << element
|
25
25
|
end
|
26
26
|
end
|
27
27
|
|
28
28
|
# collect nested headlines
|
29
29
|
headlines = options[:headlines]
|
30
30
|
# remove self from list
|
31
|
-
headlines.delete
|
32
|
-
nested_headlines = self.nested_headlines
|
31
|
+
headlines.delete(name)
|
32
|
+
nested_headlines = self.nested_headlines(headlines, name, level)
|
33
33
|
|
34
34
|
# iterate nested headlines, and call recursive
|
35
35
|
nested_headlines.each do |headline_name, value|
|
36
|
-
level = LEVEL.index
|
37
|
-
self.headlines[headline_name] =
|
36
|
+
level = LEVEL.index(value.first.first.previous.name)
|
37
|
+
self.headlines[headline_name] =
|
38
|
+
PageHeadline.new(parent: self, name: headline_name, headlines:, level:)
|
38
39
|
end
|
39
40
|
end
|
40
41
|
|
41
42
|
def elements
|
42
|
-
|
43
|
+
block.elements
|
43
44
|
end
|
44
45
|
|
45
46
|
def type
|
46
|
-
|
47
|
+
block.elements.first.first.previous.name
|
47
48
|
end
|
48
49
|
|
49
50
|
# get headline by name
|
50
|
-
def headline
|
51
|
-
name = name.downcase.gsub(
|
52
|
-
self.headlines.
|
53
|
-
|
54
|
-
end.values
|
51
|
+
def headline(name)
|
52
|
+
name = name.downcase.gsub(' ', '_')
|
53
|
+
self.headlines.select do |k, _v|
|
54
|
+
k.downcase.start_with?(name)
|
55
|
+
end.values
|
55
56
|
end
|
56
57
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
58
|
+
def headline_in_depth(name, depth = 1)
|
59
|
+
name = name.downcase.gsub(' ', '_')
|
60
|
+
ret = []
|
61
|
+
|
62
|
+
self.headlines.each do |k, v|
|
63
|
+
ret << v if k.downcase.start_with?(name)
|
64
|
+
next if v.headlines.empty?
|
65
|
+
|
66
|
+
if depth.positive?
|
67
|
+
q = v.headline_in_depth(name, (depth - 1))
|
68
|
+
ret.concat(q)
|
69
|
+
end
|
70
|
+
end
|
71
|
+
ret
|
72
|
+
end
|
71
73
|
|
72
74
|
# headline exists for current headline
|
73
|
-
def has_headline?
|
74
|
-
name = name.downcase.gsub(
|
75
|
-
self.headlines.
|
75
|
+
def has_headline?(name)
|
76
|
+
name = name.downcase.gsub(' ', '_')
|
77
|
+
self.headlines.each_key do |k|
|
76
78
|
return true if k.downcase.start_with?(name)
|
77
79
|
end
|
78
80
|
false
|
79
81
|
end
|
80
82
|
|
81
83
|
def to_hash
|
82
|
-
ret = {
|
83
|
-
self.headlines.
|
84
|
+
ret = { name:, headlines: [], type: }
|
85
|
+
self.headlines.each_value do |headline|
|
84
86
|
ret[:headlines] << headline.to_hash
|
85
87
|
end
|
86
88
|
ret
|
87
89
|
end
|
88
90
|
|
89
91
|
def to_pretty_json
|
90
|
-
JSON.pretty_generate
|
92
|
+
JSON.pretty_generate(to_hash)
|
91
93
|
end
|
92
94
|
|
93
|
-
protected
|
95
|
+
protected
|
94
96
|
|
95
97
|
# filter nested headlines (elements) from a parent headline (by name)
|
96
|
-
def nested_headlines
|
98
|
+
def nested_headlines(headlines, _name, original_level)
|
97
99
|
ret = {}
|
98
100
|
init_level = nil
|
99
101
|
# iterate headlines, skip already done onces
|
100
|
-
#headlines.drop(headline_index + 1).each do |headline|
|
102
|
+
# headlines.drop(headline_index + 1).each do |headline|
|
101
103
|
headlines.to_a.each do |name, value|
|
102
|
-
level = LEVEL.index
|
103
|
-
init_level ||= level
|
104
|
+
level = LEVEL.index(value.first.first.previous.name)
|
105
|
+
init_level ||= level
|
104
106
|
# lower level indicate nest end
|
105
107
|
break if level <= original_level
|
106
108
|
break if level < init_level
|
107
109
|
# higher level indicates nested items, these will be processed recursive
|
108
110
|
next if init_level != level
|
111
|
+
|
109
112
|
ret[name] = value
|
110
113
|
end
|
111
114
|
ret
|
112
115
|
end
|
113
|
-
|
114
116
|
end
|
115
|
-
|
116
117
|
end
|
117
|
-
end
|
118
|
+
end
|
data/lib/wiki/api/page_link.rb
CHANGED
@@ -1,38 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Wiki
|
2
4
|
module Api
|
3
|
-
|
4
5
|
# Link on a wiki page (a href=xxx)
|
5
6
|
class PageLink
|
6
|
-
|
7
7
|
attr_accessor :element, :parent
|
8
8
|
|
9
|
-
def initialize
|
10
|
-
self.element = options[:element] if options.include?
|
11
|
-
self.parent = options[:parent] if options.include?
|
9
|
+
def initialize(options = {})
|
10
|
+
self.element = options[:element] if options.include?(:element)
|
11
|
+
self.parent = options[:parent] if options.include?(:parent)
|
12
12
|
end
|
13
13
|
|
14
14
|
def to_text
|
15
|
-
Wiki::Api::Util.element_to_text
|
15
|
+
Wiki::Api::Util.element_to_text(element)
|
16
16
|
end
|
17
17
|
|
18
18
|
def uri
|
19
19
|
# lookup the root parent, and get connector info
|
20
20
|
host = Wiki::Api::Util.parent_root(self).connect.uri
|
21
|
-
href_value =
|
22
|
-
URI.parse
|
21
|
+
href_value = element.attributes['href'].value
|
22
|
+
URI.parse("#{host}#{href_value}")
|
23
23
|
end
|
24
24
|
|
25
25
|
def title
|
26
26
|
# skip links with no title
|
27
|
-
return
|
28
|
-
|
27
|
+
return '' if element.attributes['title'].nil?
|
28
|
+
|
29
|
+
element.attributes['title'].value
|
29
30
|
end
|
30
31
|
|
31
32
|
def html
|
32
|
-
"<a href=\"#{
|
33
|
+
"<a href=\"#{uri}\" alt=\"#{title}\">#{title}</a>"
|
33
34
|
end
|
34
|
-
|
35
35
|
end
|
36
|
-
|
37
36
|
end
|
38
|
-
end
|
37
|
+
end
|
data/lib/wiki/api/page_list_item.rb
CHANGED
@@ -1,34 +1,31 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Wiki
|
2
4
|
module Api
|
3
|
-
|
4
5
|
# List Items on a Page (li=xxx)
|
5
6
|
class PageListItem
|
6
|
-
|
7
7
|
attr_accessor :element, :parent
|
8
8
|
|
9
|
-
def initialize
|
10
|
-
self.
|
11
|
-
self.
|
9
|
+
def initialize(options = {})
|
10
|
+
self.parent = options[:parent] if options.include?(:parent)
|
11
|
+
self.element = options[:element] if options.include?(:element)
|
12
12
|
end
|
13
13
|
|
14
14
|
def to_text
|
15
|
-
Wiki::Api::Util.element_to_text
|
15
|
+
Wiki::Api::Util.element_to_text(element)
|
16
16
|
end
|
17
17
|
|
18
18
|
def links
|
19
|
-
|
20
|
-
PageLink.new
|
19
|
+
search('a').map do |a|
|
20
|
+
PageLink.new(parent: self, element: a)
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
24
|
protected
|
25
25
|
|
26
26
|
def search *paths
|
27
|
-
|
27
|
+
element.search(*paths)
|
28
28
|
end
|
29
|
-
|
30
|
-
|
31
29
|
end
|
32
|
-
|
33
30
|
end
|
34
|
-
end
|
31
|
+
end
|
data/lib/wiki/api/util.rb
CHANGED
@@ -1,46 +1,44 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Wiki
|
2
4
|
module Api
|
3
|
-
|
4
5
|
class Util
|
5
|
-
|
6
6
|
class << self
|
7
|
+
def element_to_text(element)
|
8
|
+
raise('not an element') unless element.is_a?(Nokogiri::XML::Element)
|
7
9
|
|
8
|
-
|
9
|
-
raise "not an element" unless element.is_a? Nokogiri::XML::Element
|
10
|
-
self.clean_text element.text
|
10
|
+
clean_text(element.text)
|
11
11
|
end
|
12
12
|
|
13
|
-
def element_filter_lists
|
14
|
-
raise
|
13
|
+
def element_filter_lists(element)
|
14
|
+
raise('not an element') unless element.is_a?(Nokogiri::XML::Element)
|
15
|
+
|
15
16
|
result = {}
|
16
|
-
element.search(
|
17
|
+
element.search('li').each_with_index do |li, i|
|
17
18
|
li.children.each do |child|
|
18
19
|
result[i] ||= []
|
19
|
-
result[i] <<
|
20
|
+
result[i] << clean_text(child.text)
|
20
21
|
end
|
21
22
|
end
|
22
|
-
result.map{ |
|
23
|
+
result.map { |_k, v| v.join('') }
|
23
24
|
end
|
24
25
|
|
25
|
-
def parent_root
|
26
|
+
def parent_root(current_object)
|
26
27
|
current = current_object
|
27
|
-
|
28
|
+
loop do
|
28
29
|
break if current.parent.nil?
|
30
|
+
|
29
31
|
current = current.parent
|
30
32
|
end
|
31
33
|
current
|
32
34
|
end
|
33
35
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
36
|
protected
|
38
|
-
def clean_text text
|
39
|
-
text.gsub(/\n/, " ").squeeze(" ").gsub(/\s(\W)/, '\1').gsub(/(\W)\s/, '\1 ').strip
|
40
|
-
end
|
41
37
|
|
38
|
+
def clean_text(text)
|
39
|
+
text.gsub(/\n/, ' ').squeeze(' ').gsub(/\s(\W)/, '\1').gsub(/(\W)\s/, '\1 ').strip
|
40
|
+
end
|
42
41
|
end
|
43
|
-
|
44
42
|
end
|
45
43
|
end
|
46
|
-
end
|
44
|
+
end
|
data/lib/wiki/api/version.rb
CHANGED
data/lib/wiki/api.rb
CHANGED
@@ -1,12 +1,13 @@
|
|
1
|
-
|
2
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/connect")
|
3
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/page")
|
4
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/page_headline")
|
5
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/page_block")
|
6
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/page_list_item")
|
7
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/page_link")
|
8
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/util")
|
1
|
+
# frozen_string_literal: true
|
9
2
|
|
3
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/version")
|
4
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/connect")
|
5
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/page")
|
6
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/page_headline")
|
7
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/page_block")
|
8
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/page_list_item")
|
9
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/page_link")
|
10
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/util")
|
10
11
|
|
11
12
|
module Wiki
|
12
13
|
module Api
|
data/test/test_helper.rb
CHANGED
data/test/unit/wiki_connect.rb
CHANGED
@@ -1,51 +1,44 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'test_helper'
|
4
|
+
require 'pry'
|
4
5
|
|
5
6
|
#
|
6
7
|
# Testing the connection to https://www.mediawiki.org/wiki/API:Main_page
|
7
8
|
#
|
8
9
|
|
9
10
|
class WikiConnect < Test::Unit::TestCase
|
10
|
-
|
11
|
-
CONFIG = { uri: "http://en.wiktionary.org" }
|
11
|
+
CONFIG = { uri: 'https://en.wiktionary.org' }.freeze
|
12
12
|
|
13
13
|
def setup
|
14
14
|
Wiki::Api::Connect.config = CONFIG
|
15
15
|
end
|
16
16
|
|
17
|
-
def teardown
|
18
|
-
end
|
17
|
+
def teardown; end
|
19
18
|
|
20
19
|
def test_connection_wiktionary
|
21
|
-
c = Wiki::Api::Connect.new
|
20
|
+
c = Wiki::Api::Connect.new(uri: 'http://en.wiktionary.org')
|
22
21
|
ret = c.connect
|
23
|
-
assert
|
22
|
+
assert(ret.is_a?(Net::HTTPMovedPermanently), 'invalid response http')
|
24
23
|
end
|
25
24
|
|
26
25
|
def test_connection_https_wiktionary
|
27
|
-
c = Wiki::Api::Connect.new
|
26
|
+
c = Wiki::Api::Connect.new(uri: 'https://en.wiktionary.org')
|
28
27
|
ret = c.connect
|
29
|
-
assert
|
28
|
+
assert(ret.is_a?(Net::HTTPOK), 'invalid response https')
|
30
29
|
end
|
31
30
|
|
32
31
|
def test_page_get
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
assert false, "expected valid page #{e.message}"
|
38
|
-
end
|
32
|
+
c = Wiki::Api::Connect.new
|
33
|
+
c.page('Wiktionary:Welcome,_newcomers')
|
34
|
+
rescue Exception => e
|
35
|
+
assert(false, "expected valid page #{e.message}")
|
39
36
|
end
|
40
37
|
|
41
38
|
def test_page_get_non_exist
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
assert (e.message == "missingtitle"), "expected invalid page #{e.message}"
|
47
|
-
end
|
39
|
+
c = Wiki::Api::Connect.new
|
40
|
+
c.page('asfsldkfjjlkanv98yhok')
|
41
|
+
rescue Exception => e
|
42
|
+
assert((e.message == 'missingtitle'), "expected invalid page #{e.message}")
|
48
43
|
end
|
49
|
-
|
50
|
-
|
51
44
|
end
|