wiki-api 0.0.2 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -13
- data/.rubocop.yml +24 -0
- data/.travis.yml +12 -0
- data/Gemfile +2 -0
- data/README.md +93 -64
- data/Rakefile +13 -1
- data/bin/console +8 -0
- data/lib/wiki/api/connect.rb +52 -28
- data/lib/wiki/api/page.rb +48 -82
- data/lib/wiki/api/page_block.rb +19 -18
- data/lib/wiki/api/page_headline.rb +104 -8
- data/lib/wiki/api/page_link.rb +18 -14
- data/lib/wiki/api/page_list_item.rb +12 -13
- data/lib/wiki/api/util.rb +24 -15
- data/lib/wiki/api/version.rb +3 -1
- data/lib/wiki/api.rb +9 -8
- data/test/test_helper.rb +4 -7
- data/test/unit/files/Wiktionary_program.html +4232 -0
- data/test/unit/wiki_connect.rb +18 -25
- data/test/unit/wiki_page_offline.rb +295 -0
- data/wiki-api.gemspec +20 -17
- metadata +57 -38
- data/test/unit/wiki_page_config.rb +0 -45
- data/test/unit/wiki_page_object.rb +0 -229
data/lib/wiki/api/page_block.rb
CHANGED
@@ -1,25 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Wiki
|
2
4
|
module Api
|
3
|
-
|
5
|
+
# Collection of elements, segmented per headline
|
4
6
|
class PageBlock
|
7
|
+
attr_accessor :elements, :parent
|
5
8
|
|
6
|
-
|
7
|
-
|
8
|
-
def initialize options={}
|
9
|
+
def initialize(options = {})
|
10
|
+
self.parent = options[:parent] if options.include?(:parent)
|
9
11
|
self.elements = []
|
10
12
|
end
|
11
13
|
|
12
|
-
def <<
|
13
|
-
|
14
|
+
def <<(value)
|
15
|
+
# value.first.previous.name
|
16
|
+
elements << value
|
14
17
|
end
|
15
18
|
|
16
19
|
def to_texts
|
17
|
-
# TODO: perhaps we should wrap the elements with objects??
|
18
20
|
texts = []
|
19
|
-
|
20
|
-
text = Wiki::Api::Util.element_to_text
|
21
|
+
elements.flatten.each do |element|
|
22
|
+
text = Wiki::Api::Util.element_to_text(element) if element.is_a?(Nokogiri::XML::Element)
|
21
23
|
next if text.nil?
|
22
24
|
next if text.empty?
|
25
|
+
|
23
26
|
texts << text
|
24
27
|
end
|
25
28
|
texts
|
@@ -27,27 +30,25 @@ module Wiki
|
|
27
30
|
|
28
31
|
def list_items
|
29
32
|
# TODO: perhaps we should wrap the elements with objects, and request a li per element??
|
30
|
-
|
31
|
-
PageListItem.new element: list_item
|
33
|
+
search('li').map do |list_item|
|
34
|
+
PageListItem.new(parent: self, element: list_item)
|
32
35
|
end
|
33
36
|
end
|
34
37
|
|
35
38
|
def links
|
36
39
|
# TODO: perhaps we should wrap the elements with objects, and request a li per element??
|
37
|
-
|
38
|
-
PageLink.new element: a
|
40
|
+
search('a').map do |a|
|
41
|
+
PageLink.new(parent: self, element: a)
|
39
42
|
end
|
40
43
|
end
|
41
44
|
|
42
45
|
protected
|
43
46
|
|
44
47
|
def search *paths
|
45
|
-
|
48
|
+
elements.flatten.flat_map do |element|
|
46
49
|
element.search(*paths)
|
47
|
-
end.reject
|
50
|
+
end.reject(&:nil?)
|
48
51
|
end
|
49
|
-
|
50
52
|
end
|
51
|
-
|
52
53
|
end
|
53
|
-
end
|
54
|
+
end
|
@@ -1,22 +1,118 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Wiki
|
2
4
|
module Api
|
3
|
-
|
5
|
+
# Headline for a page (class="mw-headline")
|
4
6
|
class PageHeadline
|
7
|
+
require 'json'
|
8
|
+
|
9
|
+
LEVEL = %w[text h1 h2 h3 h4 h5 h6].freeze
|
10
|
+
|
11
|
+
attr_accessor :name, :block, :parent, :headlines, :level
|
12
|
+
|
13
|
+
def initialize(options = {})
|
14
|
+
self.name = options[:name] if options.include?(:name)
|
15
|
+
self.parent = options[:parent] if options.include?(:parent)
|
16
|
+
self.level = options[:level] if options.include?(:level)
|
17
|
+
options[:headlines] ||= []
|
18
|
+
self.headlines ||= {}
|
19
|
+
|
20
|
+
# store elements in a block
|
21
|
+
self.block = PageBlock.new(parent: self)
|
22
|
+
if options[:headlines].include?(name)
|
23
|
+
options[:headlines][name].each do |element|
|
24
|
+
block << element
|
25
|
+
end
|
26
|
+
end
|
5
27
|
|
6
|
-
|
28
|
+
# collect nested headlines
|
29
|
+
headlines = options[:headlines]
|
30
|
+
# remove self from list
|
31
|
+
headlines.delete(name)
|
32
|
+
nested_headlines = self.nested_headlines(headlines, name, level)
|
7
33
|
|
8
|
-
|
9
|
-
|
10
|
-
|
34
|
+
# iterate nested headlines, and recurse into each one
|
35
|
+
nested_headlines.each do |headline_name, value|
|
36
|
+
level = LEVEL.index(value.first.first.previous.name)
|
37
|
+
self.headlines[headline_name] =
|
38
|
+
PageHeadline.new(parent: self, name: headline_name, headlines:, level:)
|
39
|
+
end
|
11
40
|
end
|
12
41
|
|
13
42
|
def elements
|
14
|
-
|
43
|
+
block.elements
|
15
44
|
end
|
16
45
|
|
46
|
+
def type
|
47
|
+
block.elements.first.first.previous.name
|
48
|
+
end
|
17
49
|
|
50
|
+
# get headline by name
|
51
|
+
def headline(name)
|
52
|
+
name = name.downcase.gsub(' ', '_')
|
53
|
+
self.headlines.select do |k, _v|
|
54
|
+
k.downcase.start_with?(name)
|
55
|
+
end.values
|
56
|
+
end
|
18
57
|
|
19
|
-
|
58
|
+
def headline_in_depth(name, depth = 1)
|
59
|
+
name = name.downcase.gsub(' ', '_')
|
60
|
+
ret = []
|
61
|
+
|
62
|
+
self.headlines.each do |k, v|
|
63
|
+
ret << v if k.downcase.start_with?(name)
|
64
|
+
next if v.headlines.empty?
|
65
|
+
|
66
|
+
if depth.positive?
|
67
|
+
q = v.headline_in_depth(name, (depth - 1))
|
68
|
+
ret.concat(q)
|
69
|
+
end
|
70
|
+
end
|
71
|
+
ret
|
72
|
+
end
|
73
|
+
|
74
|
+
# headline exists for current headline
|
75
|
+
def has_headline?(name)
|
76
|
+
name = name.downcase.gsub(' ', '_')
|
77
|
+
self.headlines.each_key do |k|
|
78
|
+
return true if k.downcase.start_with?(name)
|
79
|
+
end
|
80
|
+
false
|
81
|
+
end
|
20
82
|
|
83
|
+
def to_hash
|
84
|
+
ret = { name:, headlines: [], type: }
|
85
|
+
self.headlines.each_value do |headline|
|
86
|
+
ret[:headlines] << headline.to_hash
|
87
|
+
end
|
88
|
+
ret
|
89
|
+
end
|
90
|
+
|
91
|
+
def to_pretty_json
|
92
|
+
JSON.pretty_generate(to_hash)
|
93
|
+
end
|
94
|
+
|
95
|
+
protected
|
96
|
+
|
97
|
+
# filter nested headlines (elements) from a parent headline (by name)
|
98
|
+
def nested_headlines(headlines, _name, original_level)
|
99
|
+
ret = {}
|
100
|
+
init_level = nil
|
101
|
+
# iterate headlines, skip already processed ones
|
102
|
+
# headlines.drop(headline_index + 1).each do |headline|
|
103
|
+
headlines.to_a.each do |name, value|
|
104
|
+
level = LEVEL.index(value.first.first.previous.name)
|
105
|
+
init_level ||= level
|
106
|
+
# a lower level indicates the end of the nesting
|
107
|
+
break if level <= original_level
|
108
|
+
break if level < init_level
|
109
|
+
# a higher level indicates nested items; these will be processed recursively
|
110
|
+
next if init_level != level
|
111
|
+
|
112
|
+
ret[name] = value
|
113
|
+
end
|
114
|
+
ret
|
115
|
+
end
|
116
|
+
end
|
21
117
|
end
|
22
|
-
end
|
118
|
+
end
|
data/lib/wiki/api/page_link.rb
CHANGED
@@ -1,33 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Wiki
|
2
4
|
module Api
|
3
|
-
|
5
|
+
# Link on a wiki page (a href=xxx)
|
4
6
|
class PageLink
|
7
|
+
attr_accessor :element, :parent
|
5
8
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
self.element = options[:element] if options.include? :element
|
9
|
+
def initialize(options = {})
|
10
|
+
self.element = options[:element] if options.include?(:element)
|
11
|
+
self.parent = options[:parent] if options.include?(:parent)
|
10
12
|
end
|
11
13
|
|
12
14
|
def to_text
|
13
|
-
Wiki::Api::Util.element_to_text
|
15
|
+
Wiki::Api::Util.element_to_text(element)
|
14
16
|
end
|
15
17
|
|
16
18
|
def uri
|
17
|
-
|
18
|
-
|
19
|
-
|
19
|
+
# lookup the root parent, and get connector info
|
20
|
+
host = Wiki::Api::Util.parent_root(self).connect.uri
|
21
|
+
href_value = element.attributes['href'].value
|
22
|
+
URI.parse("#{host}#{href_value}")
|
20
23
|
end
|
21
24
|
|
22
25
|
def title
|
23
|
-
|
26
|
+
# skip links with no title
|
27
|
+
return '' if element.attributes['title'].nil?
|
28
|
+
|
29
|
+
element.attributes['title'].value
|
24
30
|
end
|
25
31
|
|
26
32
|
def html
|
27
|
-
"<a href=\"#{
|
33
|
+
"<a href=\"#{uri}\" alt=\"#{title}\">#{title}</a>"
|
28
34
|
end
|
29
|
-
|
30
35
|
end
|
31
|
-
|
32
36
|
end
|
33
|
-
end
|
37
|
+
end
|
@@ -1,32 +1,31 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Wiki
|
2
4
|
module Api
|
3
|
-
|
5
|
+
# List Items on a Page (li=xxx)
|
4
6
|
class PageListItem
|
7
|
+
attr_accessor :element, :parent
|
5
8
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
self.element = options[:element] if options.include? :element
|
9
|
+
def initialize(options = {})
|
10
|
+
self.parent = options[:parent] if options.include?(:parent)
|
11
|
+
self.element = options[:element] if options.include?(:element)
|
10
12
|
end
|
11
13
|
|
12
14
|
def to_text
|
13
|
-
Wiki::Api::Util.element_to_text
|
15
|
+
Wiki::Api::Util.element_to_text(element)
|
14
16
|
end
|
15
17
|
|
16
18
|
def links
|
17
|
-
|
18
|
-
PageLink.new element: a
|
19
|
+
search('a').map do |a|
|
20
|
+
PageLink.new(parent: self, element: a)
|
19
21
|
end
|
20
22
|
end
|
21
23
|
|
22
24
|
protected
|
23
25
|
|
24
26
|
def search *paths
|
25
|
-
|
27
|
+
element.search(*paths)
|
26
28
|
end
|
27
|
-
|
28
|
-
|
29
29
|
end
|
30
|
-
|
31
30
|
end
|
32
|
-
end
|
31
|
+
end
|
data/lib/wiki/api/util.rb
CHANGED
@@ -1,35 +1,44 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Wiki
|
2
4
|
module Api
|
3
|
-
|
4
5
|
class Util
|
5
|
-
|
6
6
|
class << self
|
7
|
+
def element_to_text(element)
|
8
|
+
raise('not an element') unless element.is_a?(Nokogiri::XML::Element)
|
7
9
|
|
8
|
-
|
9
|
-
raise "not an element" unless element.is_a? Nokogiri::XML::Element
|
10
|
-
self.clean_text element.text
|
10
|
+
clean_text(element.text)
|
11
11
|
end
|
12
12
|
|
13
|
-
def element_filter_lists
|
14
|
-
raise
|
13
|
+
def element_filter_lists(element)
|
14
|
+
raise('not an element') unless element.is_a?(Nokogiri::XML::Element)
|
15
|
+
|
15
16
|
result = {}
|
16
|
-
element.search(
|
17
|
+
element.search('li').each_with_index do |li, i|
|
17
18
|
li.children.each do |child|
|
18
19
|
result[i] ||= []
|
19
|
-
result[i] <<
|
20
|
+
result[i] << clean_text(child.text)
|
20
21
|
end
|
21
22
|
end
|
22
|
-
result.map{|
|
23
|
+
result.map { |_k, v| v.join('') }
|
23
24
|
end
|
24
25
|
|
26
|
+
def parent_root(current_object)
|
27
|
+
current = current_object
|
28
|
+
loop do
|
29
|
+
break if current.parent.nil?
|
25
30
|
|
26
|
-
|
27
|
-
|
28
|
-
|
31
|
+
current = current.parent
|
32
|
+
end
|
33
|
+
current
|
29
34
|
end
|
30
35
|
|
31
|
-
|
36
|
+
protected
|
32
37
|
|
38
|
+
def clean_text(text)
|
39
|
+
text.gsub(/\n/, ' ').squeeze(' ').gsub(/\s(\W)/, '\1').gsub(/(\W)\s/, '\1 ').strip
|
40
|
+
end
|
41
|
+
end
|
33
42
|
end
|
34
43
|
end
|
35
|
-
end
|
44
|
+
end
|
data/lib/wiki/api/version.rb
CHANGED
data/lib/wiki/api.rb
CHANGED
@@ -1,12 +1,13 @@
|
|
1
|
-
|
2
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/connect")
|
3
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/page")
|
4
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/page_headline")
|
5
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/page_block")
|
6
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/page_list_item")
|
7
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/page_link")
|
8
|
-
require File.expand_path(File.dirname(__FILE__) + "/api/util")
|
1
|
+
# frozen_string_literal: true
|
9
2
|
|
3
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/version")
|
4
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/connect")
|
5
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/page")
|
6
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/page_headline")
|
7
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/page_block")
|
8
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/page_list_item")
|
9
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/page_link")
|
10
|
+
require File.expand_path("#{File.dirname(__FILE__)}/api/util")
|
10
11
|
|
11
12
|
module Wiki
|
12
13
|
module Api
|
data/test/test_helper.rb
CHANGED