wgit 0.10.8 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +72 -1
- data/CODE_OF_CONDUCT.md +1 -1
- data/CONTRIBUTING.md +2 -2
- data/README.md +24 -20
- data/bin/wgit +75 -19
- data/lib/wgit/assertable.rb +33 -6
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +102 -37
- data/lib/wgit/database/adapters/in_memory.rb +204 -0
- data/lib/wgit/database/adapters/mongo_db.rb +627 -0
- data/lib/wgit/database/database.rb +18 -651
- data/lib/wgit/database/database_adapter.rb +147 -0
- data/lib/wgit/document.rb +222 -98
- data/lib/wgit/document_extractors.rb +16 -10
- data/lib/wgit/dsl.rb +74 -81
- data/lib/wgit/html_to_text.rb +277 -0
- data/lib/wgit/indexer.rb +184 -71
- data/lib/wgit/logger.rb +2 -2
- data/lib/wgit/model.rb +164 -0
- data/lib/wgit/response.rb +25 -13
- data/lib/wgit/robots_parser.rb +193 -0
- data/lib/wgit/url.rb +150 -90
- data/lib/wgit/utils.rb +200 -37
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +18 -13
- metadata +56 -43
- data/lib/wgit/database/model.rb +0 -60
@@ -5,7 +5,7 @@
|
|
5
5
|
# Base.
|
6
6
|
Wgit::Document.define_extractor(
|
7
7
|
:base,
|
8
|
-
|
8
|
+
"//base/@href",
|
9
9
|
singleton: true,
|
10
10
|
text_content_only: true
|
11
11
|
) do |base|
|
@@ -15,7 +15,7 @@ end
|
|
15
15
|
# Title.
|
16
16
|
Wgit::Document.define_extractor(
|
17
17
|
:title,
|
18
|
-
|
18
|
+
"//title",
|
19
19
|
singleton: true,
|
20
20
|
text_content_only: true
|
21
21
|
)
|
@@ -43,17 +43,18 @@ Wgit::Document.define_extractor(
|
|
43
43
|
singleton: true,
|
44
44
|
text_content_only: true
|
45
45
|
) do |keywords, _source, type|
|
46
|
-
if keywords &&
|
47
|
-
keywords = keywords.split(
|
48
|
-
Wgit::Utils.sanitize(keywords)
|
46
|
+
if keywords && type == :document
|
47
|
+
keywords = keywords.split(",")
|
48
|
+
keywords = Wgit::Utils.sanitize(keywords)
|
49
49
|
end
|
50
|
+
|
50
51
|
keywords
|
51
52
|
end
|
52
53
|
|
53
54
|
# Links.
|
54
55
|
Wgit::Document.define_extractor(
|
55
56
|
:links,
|
56
|
-
|
57
|
+
"//a/@href",
|
57
58
|
singleton: false,
|
58
59
|
text_content_only: true
|
59
60
|
) do |links|
|
@@ -65,7 +66,12 @@ end
|
|
65
66
|
# Text.
|
66
67
|
Wgit::Document.define_extractor(
|
67
68
|
:text,
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
)
|
69
|
+
nil # doc.parser contains all HTML so omit the xpath search.
|
70
|
+
) do |text, doc, type|
|
71
|
+
if type == :document
|
72
|
+
html_to_text = Wgit::HTMLToText.new(doc.parser)
|
73
|
+
text = html_to_text.extract
|
74
|
+
end
|
75
|
+
|
76
|
+
text
|
77
|
+
end
|
data/lib/wgit/dsl.rb
CHANGED
@@ -44,14 +44,14 @@ the 'start' function".freeze
|
|
44
44
|
Wgit::Document.define_extractor(var, xpath, opts, &block)
|
45
45
|
end
|
46
46
|
|
47
|
-
#
|
48
|
-
#
|
49
|
-
# more details.
|
47
|
+
# Sets and returns the Wgit::Crawler used in subsequent crawls including
|
48
|
+
# indexing. Defaults to `Wgit::Crawler.new` if not given a param. See the
|
49
|
+
# Wgit::Crawler documentation for more details.
|
50
50
|
#
|
51
|
-
# @yield [crawler]
|
52
|
-
# @return [Wgit::Crawler] The
|
53
|
-
def crawler
|
54
|
-
@dsl_crawler
|
51
|
+
# @yield [crawler] Given the DSL crawler; use the block to configure.
|
52
|
+
# @return [Wgit::Crawler] The crawler instance used by the DSL.
|
53
|
+
def use_crawler(crawler = nil)
|
54
|
+
@dsl_crawler = crawler || @dsl_crawler || Wgit::Crawler.new
|
55
55
|
yield @dsl_crawler if block_given?
|
56
56
|
@dsl_crawler
|
57
57
|
end
|
@@ -66,7 +66,7 @@ the 'start' function".freeze
|
|
66
66
|
# @yield [crawler] The crawler that'll be used in the subsequent
|
67
67
|
# crawl/index; use the block to configure.
|
68
68
|
def start(*urls, &block)
|
69
|
-
|
69
|
+
use_crawler(&block) if block_given?
|
70
70
|
@dsl_start = urls
|
71
71
|
end
|
72
72
|
|
@@ -101,7 +101,7 @@ the 'start' function".freeze
|
|
101
101
|
raise DSL_ERROR__NO_START_URL if urls.empty?
|
102
102
|
|
103
103
|
urls.map! { |url| Wgit::Url.parse(url) }
|
104
|
-
|
104
|
+
get_crawler.crawl_urls(*urls, follow_redirects:, &block)
|
105
105
|
end
|
106
106
|
|
107
107
|
# Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no
|
@@ -135,47 +135,44 @@ the 'start' function".freeze
|
|
135
135
|
raise DSL_ERROR__NO_START_URL if urls.empty?
|
136
136
|
|
137
137
|
xpath = follow || :default
|
138
|
-
opts = {
|
139
|
-
follow: xpath, allow_paths: allow_paths, disallow_paths: disallow_paths
|
140
|
-
}
|
138
|
+
opts = { follow: xpath, allow_paths:, disallow_paths: }
|
141
139
|
|
142
140
|
urls.reduce([]) do |externals, url|
|
143
|
-
externals +
|
141
|
+
externals + get_crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
|
144
142
|
end
|
145
143
|
end
|
146
144
|
|
147
|
-
# Returns the DSL's `
|
145
|
+
# Returns the DSL's `Wgit::Crawler#last_response`.
|
148
146
|
#
|
149
147
|
# @return [Wgit::Response] The response from the last URL crawled.
|
150
148
|
def last_response
|
151
|
-
|
149
|
+
get_crawler.last_response
|
152
150
|
end
|
153
151
|
|
154
152
|
# Nilifies the DSL instance variables.
|
155
153
|
def reset
|
156
|
-
@dsl_crawler
|
157
|
-
@dsl_start
|
158
|
-
@dsl_follow
|
159
|
-
@
|
154
|
+
@dsl_crawler = nil
|
155
|
+
@dsl_start = nil
|
156
|
+
@dsl_follow = nil
|
157
|
+
@dsl_db = nil
|
160
158
|
end
|
161
159
|
|
162
160
|
### INDEXER METHODS ###
|
163
161
|
|
164
|
-
# Defines the
|
165
|
-
# method calls. This method is optional as
|
166
|
-
#
|
162
|
+
# Defines the connected database instance used in subsequent index and DB
|
163
|
+
# method calls. This method is optional however, as a new instance of the
|
164
|
+
# Wgit::Database.adapter_class will be initialised otherwise. Therefore
|
165
|
+
# if not calling this method, you should ensure
|
166
|
+
# ENV['WGIT_CONNECTION_STRING'] is set or the connection will fail.
|
167
167
|
#
|
168
|
-
# @param
|
169
|
-
#
|
170
|
-
def
|
171
|
-
@
|
168
|
+
# @param db [Wgit::Database::DatabaseAdapter] The connected database
|
169
|
+
# instance used in subsequent `index*` method calls.
|
170
|
+
def use_database(db)
|
171
|
+
@dsl_db = db
|
172
172
|
end
|
173
173
|
|
174
174
|
# Indexes the World Wide Web using `Wgit::Indexer#index_www` underneath.
|
175
175
|
#
|
176
|
-
# @param connection_string [String] The database connection string. Set as
|
177
|
-
# nil to use ENV['WGIT_CONNECTION_STRING'] or set using
|
178
|
-
# `connection_string`.
|
179
176
|
# @param max_sites [Integer] The number of separate and whole
|
180
177
|
# websites to be crawled before the method exits. Defaults to -1 which
|
181
178
|
# means the crawl will occur until manually stopped (Ctrl+C etc).
|
@@ -183,22 +180,16 @@ the 'start' function".freeze
|
|
183
180
|
# scraped from the web (default is 1GB). Note, that this value is used to
|
184
181
|
# determine when to stop crawling; it's not a guarantee of the max data
|
185
182
|
# that will be obtained.
|
186
|
-
def index_www(
|
187
|
-
|
188
|
-
)
|
189
|
-
db = Wgit::Database.new(connection_string)
|
190
|
-
indexer = Wgit::Indexer.new(db, crawler)
|
183
|
+
def index_www(max_sites: -1, max_data: 1_048_576_000)
|
184
|
+
indexer = Wgit::Indexer.new(get_db, get_crawler)
|
191
185
|
|
192
|
-
indexer.index_www(max_sites
|
186
|
+
indexer.index_www(max_sites:, max_data:)
|
193
187
|
end
|
194
188
|
|
195
189
|
# Indexes a single website using `Wgit::Indexer#index_site` underneath.
|
196
190
|
#
|
197
191
|
# @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to
|
198
192
|
# crawl. Can be set using `start`.
|
199
|
-
# @param connection_string [String] The database connection string. Set as
|
200
|
-
# nil to use ENV['WGIT_CONNECTION_STRING'] or set using
|
201
|
-
# `connection_string`.
|
202
193
|
# @param insert_externals [Boolean] Whether or not to insert the website's
|
203
194
|
# external URL's into the database.
|
204
195
|
# @param follow [String] The xpath extracting links to be followed during
|
@@ -215,19 +206,16 @@ the 'start' function".freeze
|
|
215
206
|
# set.
|
216
207
|
# @return [Integer] The total number of pages crawled within the website.
|
217
208
|
def index_site(
|
218
|
-
*urls,
|
219
|
-
insert_externals: false, follow: @dsl_follow,
|
209
|
+
*urls, insert_externals: false, follow: @dsl_follow,
|
220
210
|
allow_paths: nil, disallow_paths: nil, &block
|
221
211
|
)
|
222
212
|
urls = (@dsl_start || []) if urls.empty?
|
223
213
|
raise DSL_ERROR__NO_START_URL if urls.empty?
|
224
214
|
|
225
|
-
|
226
|
-
indexer = Wgit::Indexer.new(db, crawler)
|
215
|
+
indexer = Wgit::Indexer.new(get_db, get_crawler)
|
227
216
|
xpath = follow || :default
|
228
217
|
crawl_opts = {
|
229
|
-
insert_externals
|
230
|
-
allow_paths: allow_paths, disallow_paths: disallow_paths
|
218
|
+
insert_externals:, follow: xpath, allow_paths:, disallow_paths:
|
231
219
|
}
|
232
220
|
|
233
221
|
urls.reduce(0) do |total, url|
|
@@ -239,9 +227,6 @@ the 'start' function".freeze
|
|
239
227
|
#
|
240
228
|
# @param urls [*Wgit::Url] The webpage URL's to crawl. Defaults to the
|
241
229
|
# `start` URL(s).
|
242
|
-
# @param connection_string [String] The database connection string. Set as
|
243
|
-
# nil to use ENV['WGIT_CONNECTION_STRING'] or set using
|
244
|
-
# `connection_string`.
|
245
230
|
# @param insert_externals [Boolean] Whether or not to insert the website's
|
246
231
|
# external URL's into the database.
|
247
232
|
# @yield [doc] Given the Wgit::Document of the crawled webpage,
|
@@ -250,28 +235,24 @@ the 'start' function".freeze
|
|
250
235
|
# document from being saved into the database.
|
251
236
|
# @raise [StandardError] If no urls are provided and no `start` URL has
|
252
237
|
# been set.
|
253
|
-
def index(
|
254
|
-
*urls, connection_string: @dsl_conn_str,
|
255
|
-
insert_externals: false, &block
|
256
|
-
)
|
238
|
+
def index(*urls, insert_externals: false, &block)
|
257
239
|
urls = (@dsl_start || []) if urls.empty?
|
258
240
|
raise DSL_ERROR__NO_START_URL if urls.empty?
|
259
241
|
|
260
|
-
|
261
|
-
indexer = Wgit::Indexer.new(db, crawler)
|
242
|
+
indexer = Wgit::Indexer.new(get_db, get_crawler)
|
262
243
|
|
263
244
|
urls.map! { |url| Wgit::Url.parse(url) }
|
264
|
-
indexer.index_urls(*urls, insert_externals
|
245
|
+
indexer.index_urls(*urls, insert_externals:, &block)
|
265
246
|
end
|
266
247
|
|
248
|
+
### DATABASE METHODS ###
|
249
|
+
|
267
250
|
# Performs a search of the database's indexed documents and pretty prints
|
268
|
-
# the results in a search engine-esque format. See
|
269
|
-
# and `Wgit::Document#search!`
|
251
|
+
# the results in a search engine-esque format. See
|
252
|
+
# `Wgit::Database::DatabaseAdapter#search` and `Wgit::Document#search!`
|
253
|
+
# for details of how the search methods work.
|
270
254
|
#
|
271
255
|
# @param query [String] The text query to search with.
|
272
|
-
# @param connection_string [String] The database connection string. Set as
|
273
|
-
# nil to use ENV['WGIT_CONNECTION_STRING'] or set using
|
274
|
-
# `connection_string`.
|
275
256
|
# @param stream [nil, #puts] Any object that respond_to?(:puts). It is used
|
276
257
|
# to output text somewhere e.g. a file or STDERR. Use nil for no output.
|
277
258
|
# @param case_sensitive [Boolean] Whether character case must match.
|
@@ -285,41 +266,53 @@ the 'start' function".freeze
|
|
285
266
|
# database containing only its matching `#text`.
|
286
267
|
# @return [Array<Wgit::Document>] The search results with matching text.
|
287
268
|
def search(
|
288
|
-
query,
|
269
|
+
query, stream: $stdout,
|
270
|
+
top_result_only: true, include_score: false,
|
289
271
|
case_sensitive: false, whole_sentence: true,
|
290
|
-
limit: 10, skip: 0, sentence_limit: 80
|
272
|
+
limit: 10, skip: 0, sentence_limit: 80
|
291
273
|
)
|
292
274
|
stream ||= File.open(File::NULL, 'w')
|
293
|
-
db = Wgit::Database.new(connection_string)
|
294
275
|
|
295
|
-
results =
|
296
|
-
query,
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
)
|
276
|
+
results = get_db.search(
|
277
|
+
query, case_sensitive:, whole_sentence:, limit:, skip:)
|
278
|
+
|
279
|
+
results.each do |doc|
|
280
|
+
doc.search_text!(
|
281
|
+
query, case_sensitive:, whole_sentence:, sentence_limit:)
|
282
|
+
yield(doc) if block_given?
|
283
|
+
end
|
304
284
|
|
305
|
-
|
285
|
+
if top_result_only
|
286
|
+
Wgit::Utils.pprint_top_search_results(results, include_score:, stream:)
|
287
|
+
else
|
288
|
+
Wgit::Utils.pprint_all_search_results(results, include_score:, stream:)
|
289
|
+
end
|
306
290
|
|
307
291
|
results
|
308
292
|
end
|
309
293
|
|
310
294
|
# Deletes everything in the urls and documents collections by calling
|
311
|
-
# `Wgit::Database#
|
312
|
-
# so yeah... be careful.
|
295
|
+
# `Wgit::Database::DatabaseAdapter#empty` underneath.
|
313
296
|
#
|
314
297
|
# @return [Integer] The number of deleted records.
|
315
|
-
def
|
316
|
-
|
317
|
-
|
298
|
+
def empty_db!
|
299
|
+
get_db.empty
|
300
|
+
end
|
301
|
+
|
302
|
+
private
|
303
|
+
|
304
|
+
def get_crawler
|
305
|
+
@dsl_crawler ||= Wgit::Crawler.new
|
306
|
+
end
|
307
|
+
|
308
|
+
def get_db
|
309
|
+
@dsl_db ||= Wgit::Database.new
|
318
310
|
end
|
319
311
|
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
312
|
+
alias_method :crawl_url, :crawl
|
313
|
+
alias_method :crawl_r, :crawl_site
|
314
|
+
alias_method :index_r, :index_site
|
315
|
+
alias_method :index_url, :index
|
316
|
+
alias_method :start_urls, :start
|
324
317
|
end
|
325
318
|
end
|
@@ -0,0 +1,277 @@
|
|
1
|
+
require_relative "utils"
|
2
|
+
require_relative "assertable"
|
3
|
+
require "nokogiri"
|
4
|
+
|
5
|
+
module Wgit
|
6
|
+
# Class used to extract the visible page text from a HTML string.
|
7
|
+
# This is in turn used to set the output of a Wgit::Document#text method.
|
8
|
+
class HTMLToText
|
9
|
+
include Assertable
|
10
|
+
|
11
|
+
# Set of text elements used to extract the visible text.
|
12
|
+
# The element's display (:inline or :block) is used to delimit sentences e.g.
|
13
|
+
# <div>foo</div><div>bar</div> will be extracted as ['foo', 'bar'] whereas
|
14
|
+
# <span>foo</span><span>bar</span> will be extracted as ['foobar'].
|
15
|
+
@text_elements = {
|
16
|
+
a: :inline,
|
17
|
+
abbr: :inline,
|
18
|
+
address: :block,
|
19
|
+
article: :block,
|
20
|
+
aside: :block,
|
21
|
+
b: :inline,
|
22
|
+
bdi: :inline,
|
23
|
+
bdo: :inline,
|
24
|
+
blockquote: :block,
|
25
|
+
br: :block,
|
26
|
+
button: :block, # Normally inline but Wgit treats as block.
|
27
|
+
caption: :block,
|
28
|
+
cite: :inline,
|
29
|
+
code: :inline,
|
30
|
+
data: :inline,
|
31
|
+
dd: :block,
|
32
|
+
del: :inline,
|
33
|
+
details: :block,
|
34
|
+
dfn: :inline,
|
35
|
+
div: :block,
|
36
|
+
dl: :block,
|
37
|
+
dt: :block,
|
38
|
+
em: :inline,
|
39
|
+
figcaption: :block,
|
40
|
+
figure: :block,
|
41
|
+
footer: :block,
|
42
|
+
h1: :block,
|
43
|
+
h2: :block,
|
44
|
+
h3: :block,
|
45
|
+
h4: :block,
|
46
|
+
h5: :block,
|
47
|
+
h6: :block,
|
48
|
+
header: :block,
|
49
|
+
hr: :block,
|
50
|
+
i: :inline,
|
51
|
+
input: :inline,
|
52
|
+
ins: :block,
|
53
|
+
kbd: :inline,
|
54
|
+
label: :inline,
|
55
|
+
legend: :block,
|
56
|
+
li: :block,
|
57
|
+
main: :block,
|
58
|
+
mark: :inline,
|
59
|
+
meter: :block,
|
60
|
+
ol: :block,
|
61
|
+
option: :block,
|
62
|
+
output: :block,
|
63
|
+
p: :block,
|
64
|
+
pre: :block,
|
65
|
+
q: :inline,
|
66
|
+
rb: :inline,
|
67
|
+
rt: :inline,
|
68
|
+
ruby: :inline,
|
69
|
+
s: :inline,
|
70
|
+
samp: :inline,
|
71
|
+
section: :block,
|
72
|
+
small: :inline,
|
73
|
+
span: :inline,
|
74
|
+
strong: :inline,
|
75
|
+
sub: :inline,
|
76
|
+
summary: :block,
|
77
|
+
sup: :inline,
|
78
|
+
td: :block,
|
79
|
+
textarea: :block,
|
80
|
+
th: :block,
|
81
|
+
time: :inline,
|
82
|
+
u: :inline,
|
83
|
+
ul: :block,
|
84
|
+
var: :inline,
|
85
|
+
wbr: :inline
|
86
|
+
}
|
87
|
+
|
88
|
+
class << self
|
89
|
+
# Set of HTML elements that make up the visible text on a page. These
|
90
|
+
# elements are used to initialize the Wgit::Document#text. See the
|
91
|
+
# README.md for how to add to this Hash dynamically.
|
92
|
+
attr_reader :text_elements
|
93
|
+
end
|
94
|
+
|
95
|
+
# The Nokogiri::HTML document object initialized from a HTML string.
|
96
|
+
attr_reader :parser
|
97
|
+
|
98
|
+
# Creates a new HTML to text extractor instance.
|
99
|
+
#
|
100
|
+
# @param parser [Nokogiri::HTML4::Document] The nokogiri parser object.
|
101
|
+
# @raise [StandardError] If the given parser is of an invalid type.
|
102
|
+
def initialize(parser)
|
103
|
+
assert_type(parser, Nokogiri::HTML4::Document)
|
104
|
+
|
105
|
+
@parser = parser
|
106
|
+
end
|
107
|
+
|
108
|
+
# Extracts and returns the text sentences from the @parser HTML.
|
109
|
+
#
|
110
|
+
# @return [Array<String>] An array of unique text sentences.
|
111
|
+
def extract_arr
|
112
|
+
return [] if @parser.to_s.empty?
|
113
|
+
|
114
|
+
text_str = extract_str
|
115
|
+
|
116
|
+
# Split the text_str into an Array of text sentences.
|
117
|
+
text_str
|
118
|
+
.split("\n")
|
119
|
+
.map(&:strip)
|
120
|
+
.reject(&:empty?)
|
121
|
+
end
|
122
|
+
|
123
|
+
# Extracts and returns a text string from the @parser HTML.
|
124
|
+
#
|
125
|
+
# @return [String] A string of text with \n delimiting sentences.
|
126
|
+
def extract_str
|
127
|
+
text_str = ""
|
128
|
+
|
129
|
+
iterate_child_nodes(@parser) do |node, display|
|
130
|
+
# Handle any special cases e.g. skip nodes we don't care about...
|
131
|
+
# <pre> nodes should have their contents displayed exactly as is.
|
132
|
+
if node_name(node) == :pre
|
133
|
+
text_str << "\n"
|
134
|
+
text_str << node.text
|
135
|
+
next
|
136
|
+
end
|
137
|
+
|
138
|
+
# Skip any child node of <pre> since they're handled as a special case above.
|
139
|
+
next if child_of?(:pre, node)
|
140
|
+
|
141
|
+
if node.text?
|
142
|
+
# Skip any text element that is purely whitespace.
|
143
|
+
next unless valid_text_content?(node.text)
|
144
|
+
else
|
145
|
+
# Skip a concrete node if it has other concrete child nodes as these
|
146
|
+
# will be iterated onto later.
|
147
|
+
#
|
148
|
+
# Process if node has no children or one child which is a valid text node.
|
149
|
+
next unless node.children.empty? || parent_of_text_node_only?(node)
|
150
|
+
end
|
151
|
+
|
152
|
+
# Apply display rules deciding if a new line is needed before node.text.
|
153
|
+
add_new_line = false
|
154
|
+
prev = prev_sibling_or_parent(node)
|
155
|
+
|
156
|
+
if node.text?
|
157
|
+
add_new_line = true unless prev && inline?(prev)
|
158
|
+
else
|
159
|
+
add_new_line = true if display == :block
|
160
|
+
add_new_line = true if prev && block?(prev)
|
161
|
+
end
|
162
|
+
|
163
|
+
text_str << "\n" if add_new_line
|
164
|
+
text_str << format_text(node.text)
|
165
|
+
end
|
166
|
+
|
167
|
+
text_str
|
168
|
+
.strip
|
169
|
+
.squeeze("\n")
|
170
|
+
.squeeze(" ")
|
171
|
+
end
|
172
|
+
|
173
|
+
private
|
174
|
+
|
175
|
+
def node_name(node)
|
176
|
+
node.name&.downcase&.to_sym
|
177
|
+
end
|
178
|
+
|
179
|
+
def display(node)
|
180
|
+
name = node_name(node)
|
181
|
+
Wgit::HTMLToText.text_elements[name]
|
182
|
+
end
|
183
|
+
|
184
|
+
def inline?(node)
|
185
|
+
display(node) == :inline
|
186
|
+
end
|
187
|
+
|
188
|
+
def block?(node)
|
189
|
+
display(node) == :block
|
190
|
+
end
|
191
|
+
|
192
|
+
# Returns the previous sibling of node or nil. Only valid text elements are
|
193
|
+
# returned i.e. non duplicates with valid text content.
|
194
|
+
def prev_sibling(node)
|
195
|
+
prev = node.previous
|
196
|
+
|
197
|
+
return nil unless prev
|
198
|
+
return prev unless prev.text?
|
199
|
+
return prev if valid_text_node?(prev) && !contains_new_line?(prev.text)
|
200
|
+
return prev if valid_text_node?(prev) && !format_text(prev.text).strip.empty?
|
201
|
+
|
202
|
+
prev.previous
|
203
|
+
end
|
204
|
+
|
205
|
+
# Returns node's previous sibling, parent or nil; in that order. Only valid
|
206
|
+
# text elements are returned i.e. non duplicates with valid text content.
|
207
|
+
def prev_sibling_or_parent(node)
|
208
|
+
prev = prev_sibling(node)
|
209
|
+
return prev if prev
|
210
|
+
|
211
|
+
node.parent
|
212
|
+
end
|
213
|
+
|
214
|
+
def child_of?(ancestor_name, node)
|
215
|
+
node.ancestors.any? { |ancestor| node_name(ancestor) == ancestor_name }
|
216
|
+
end
|
217
|
+
|
218
|
+
# Returns true if any of the child nodes contain a non empty :text node.
|
219
|
+
def parent_of_text_node?(node)
|
220
|
+
node.children.any? { |child| child.text? && valid_text_content?(child.text) }
|
221
|
+
end
|
222
|
+
|
223
|
+
def parent_of_text_node_only?(node)
|
224
|
+
node.children.size == 1 && parent_of_text_node?(node)
|
225
|
+
end
|
226
|
+
|
227
|
+
# Returns true if text is not empty having removed all new lines.
|
228
|
+
def valid_text_content?(text)
|
229
|
+
!format_text(text).empty?
|
230
|
+
end
|
231
|
+
|
232
|
+
# Returns true if node is a text node.
|
233
|
+
# Duplicate text nodes (that follow a concrete node) are omitted.
|
234
|
+
def valid_text_node?(node)
|
235
|
+
node.text? && node.text != node.parent.text
|
236
|
+
end
|
237
|
+
|
238
|
+
def contains_new_line?(text)
|
239
|
+
["\n", '\\n'].any? { |new_line| text.include?(new_line) }
|
240
|
+
end
|
241
|
+
|
242
|
+
# Remove special characters including any new lines; as semantic HTML will
|
243
|
+
# typically use <br> and/or block elements to denote a line break.
|
244
|
+
def format_text(text)
|
245
|
+
text
|
246
|
+
.encode("UTF-8", undef: :replace, invalid: :replace)
|
247
|
+
.gsub("\n", "")
|
248
|
+
.gsub('\\n', "")
|
249
|
+
.gsub("\r", "")
|
250
|
+
.gsub('\\r', "")
|
251
|
+
.gsub("\f", "")
|
252
|
+
.gsub('\\f', "")
|
253
|
+
.gsub("\t", "")
|
254
|
+
.gsub('\\t', "")
|
255
|
+
.gsub("‌", "")
|
256
|
+
.gsub(" ", " ")
|
257
|
+
.gsub(" ", " ")
|
258
|
+
.gsub(" ", " ")
|
259
|
+
.gsub(" ", " ")
|
260
|
+
.gsub(" ", " ")
|
261
|
+
.gsub('\u00a0', " ")
|
262
|
+
end
|
263
|
+
|
264
|
+
# Iterate over node and it's child nodes, yielding each to &block.
|
265
|
+
# Only HTMLToText.text_elements or valid :text nodes will be yielded.
|
266
|
+
# Duplicate text nodes (that follow a concrete node) are omitted.
|
267
|
+
def iterate_child_nodes(node, &block)
|
268
|
+
display = display(node)
|
269
|
+
text_node = valid_text_node?(node)
|
270
|
+
|
271
|
+
yield(node, display) if display || text_node
|
272
|
+
node.children.each { |child| iterate_child_nodes(child, &block) }
|
273
|
+
end
|
274
|
+
|
275
|
+
alias_method :extract, :extract_arr
|
276
|
+
end
|
277
|
+
end
|