wgit 0.10.8 → 0.12.0

@@ -5,7 +5,7 @@
 # Base.
 Wgit::Document.define_extractor(
   :base,
-  '//base/@href',
+  "//base/@href",
   singleton: true,
   text_content_only: true
 ) do |base|
@@ -15,7 +15,7 @@ end
 # Title.
 Wgit::Document.define_extractor(
   :title,
-  '//title',
+  "//title",
   singleton: true,
   text_content_only: true
 )
@@ -43,17 +43,18 @@ Wgit::Document.define_extractor(
   singleton: true,
   text_content_only: true
 ) do |keywords, _source, type|
-  if keywords && (type == :document)
-    keywords = keywords.split(',')
-    Wgit::Utils.sanitize(keywords)
+  if keywords && type == :document
+    keywords = keywords.split(",")
+    keywords = Wgit::Utils.sanitize(keywords)
   end
+
   keywords
 end
 
 # Links.
 Wgit::Document.define_extractor(
   :links,
-  '//a/@href',
+  "//a/@href",
   singleton: false,
   text_content_only: true
 ) do |links|
@@ -65,7 +66,12 @@ end
 # Text.
 Wgit::Document.define_extractor(
   :text,
-  proc { Wgit::Document.text_elements_xpath },
-  singleton: false,
-  text_content_only: true
-)
+  nil # doc.parser contains all HTML so omit the xpath search.
+) do |text, doc, type|
+  if type == :document
+    html_to_text = Wgit::HTMLToText.new(doc.parser)
+    text = html_to_text.extract
+  end
+
+  text
+end
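The upshot of the extractor changes above is that `Document#text` is no longer built from an XPath union of text elements; the whole `doc.parser` is handed to the new `Wgit::HTMLToText` class (added below). A minimal sketch of the resulting behaviour, using hypothetical markup:

```ruby
require "wgit"

# Hypothetical markup, purely for illustration.
html = "<html><body><div>foo</div><span>bar</span><span>baz</span></body></html>"
doc  = Wgit::Document.new(Wgit::Url.parse("http://example.com"), html)

# The :text extractor now delegates to Wgit::HTMLToText#extract, so block
# elements delimit sentences while adjacent inline elements are joined.
doc.text # => ["foo", "barbaz"]
```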
data/lib/wgit/dsl.rb CHANGED
@@ -44,14 +44,14 @@ the 'start' function".freeze
       Wgit::Document.define_extractor(var, xpath, opts, &block)
     end
 
-    # Initializes a `Wgit::Crawler`. This crawler is then used in all crawl and
-    # index methods used by the DSL. See the Wgit::Crawler documentation for
-    # more details.
+    # Sets and returns the Wgit::Crawler used in subsequent crawls including
+    # indexing. Defaults to `Wgit::Crawler.new` if not given a param. See the
+    # Wgit::Crawler documentation for more details.
     #
-    # @yield [crawler] The created crawler; use the block to configure.
-    # @return [Wgit::Crawler] The created crawler used by the DSL.
-    def crawler
-      @dsl_crawler ||= Wgit::Crawler.new
+    # @yield [crawler] Given the DSL crawler; use the block to configure.
+    # @return [Wgit::Crawler] The crawler instance used by the DSL.
+    def use_crawler(crawler = nil)
+      @dsl_crawler = crawler || @dsl_crawler || Wgit::Crawler.new
       yield @dsl_crawler if block_given?
       @dsl_crawler
     end
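The `crawler` DSL method above has become `use_crawler`, which still yields for block configuration but now also accepts a ready-made instance. A hedged sketch of both call styles (`timeout` and `redirect_limit` are illustrative `Wgit::Crawler` options):

```ruby
require "wgit"

include Wgit::DSL

# Configure the memoized DSL crawler via a block, as `crawler` did before...
use_crawler { |crawler| crawler.timeout = 10 }

# ...or inject a pre-built instance, which the old method couldn't accept.
use_crawler(Wgit::Crawler.new(redirect_limit: 3))
```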
@@ -66,7 +66,7 @@ the 'start' function".freeze
     # @yield [crawler] The crawler that'll be used in the subsequent
     #   crawl/index; use the block to configure.
     def start(*urls, &block)
-      crawler(&block)
+      use_crawler(&block) if block_given?
       @dsl_start = urls
     end
 
@@ -101,7 +101,7 @@ the 'start' function".freeze
       raise DSL_ERROR__NO_START_URL if urls.empty?
 
       urls.map! { |url| Wgit::Url.parse(url) }
-      crawler.crawl_urls(*urls, follow_redirects: follow_redirects, &block)
+      get_crawler.crawl_urls(*urls, follow_redirects:, &block)
     end
 
     # Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no
@@ -135,47 +135,44 @@ the 'start' function".freeze
       raise DSL_ERROR__NO_START_URL if urls.empty?
 
       xpath = follow || :default
-      opts = {
-        follow: xpath, allow_paths: allow_paths, disallow_paths: disallow_paths
-      }
+      opts = { follow: xpath, allow_paths:, disallow_paths: }
 
       urls.reduce([]) do |externals, url|
-        externals + crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
+        externals + get_crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
       end
     end
 
-    # Returns the DSL's `crawler#last_response`.
+    # Returns the DSL's `Wgit::Crawler#last_response`.
     #
     # @return [Wgit::Response] The response from the last URL crawled.
     def last_response
-      crawler.last_response
+      get_crawler.last_response
     end
 
    # Nilifies the DSL instance variables.
     def reset
-      @dsl_crawler  = nil
-      @dsl_start    = nil
-      @dsl_follow   = nil
-      @dsl_conn_str = nil
+      @dsl_crawler = nil
+      @dsl_start   = nil
+      @dsl_follow  = nil
+      @dsl_db      = nil
     end
 
     ### INDEXER METHODS ###
 
-    # Defines the connection string to the database used in subsequent `index*`
-    # method calls. This method is optional as the connection string can be
-    # passed to the index method instead.
+    # Defines the connected database instance used in subsequent index and DB
+    # method calls. This method is optional however, as a new instance of the
+    # Wgit::Database.adapter_class will be initialised otherwise. Therefore
+    # if not calling this method, you should ensure
+    # ENV['WGIT_CONNECTION_STRING'] is set or the connection will fail.
     #
-    # @param conn_str [String] The connection string used to connect to the
-    #   database in subsequent `index*` method calls.
-    def connection_string(conn_str)
-      @dsl_conn_str = conn_str
+    # @param db [Wgit::Database::DatabaseAdapter] The connected database
+    #   instance used in subsequent `index*` method calls.
+    def use_database(db)
+      @dsl_db = db
     end
 
     # Indexes the World Wide Web using `Wgit::Indexer#index_www` underneath.
     #
-    # @param connection_string [String] The database connection string. Set as
-    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
-    #   `connection_string`.
     # @param max_sites [Integer] The number of separate and whole
     #   websites to be crawled before the method exits. Defaults to -1 which
     #   means the crawl will occur until manually stopped (Ctrl+C etc).
@@ -183,22 +180,16 @@ the 'start' function".freeze
     #   scraped from the web (default is 1GB). Note, that this value is used to
     #   determine when to stop crawling; it's not a guarantee of the max data
     #   that will be obtained.
-    def index_www(
-      connection_string: @dsl_conn_str, max_sites: -1, max_data: 1_048_576_000
-    )
-      db = Wgit::Database.new(connection_string)
-      indexer = Wgit::Indexer.new(db, crawler)
+    def index_www(max_sites: -1, max_data: 1_048_576_000)
+      indexer = Wgit::Indexer.new(get_db, get_crawler)
 
-      indexer.index_www(max_sites: max_sites, max_data: max_data)
+      indexer.index_www(max_sites:, max_data:)
     end
 
     # Indexes a single website using `Wgit::Indexer#index_site` underneath.
     #
     # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to
     #   crawl. Can be set using `start`.
-    # @param connection_string [String] The database connection string. Set as
-    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
-    #   `connection_string`.
     # @param insert_externals [Boolean] Whether or not to insert the website's
     #   external URL's into the database.
     # @param follow [String] The xpath extracting links to be followed during
@@ -215,19 +206,16 @@ the 'start' function".freeze
     #   set.
     # @return [Integer] The total number of pages crawled within the website.
     def index_site(
-      *urls, connection_string: @dsl_conn_str,
-      insert_externals: false, follow: @dsl_follow,
+      *urls, insert_externals: false, follow: @dsl_follow,
       allow_paths: nil, disallow_paths: nil, &block
     )
       urls = (@dsl_start || []) if urls.empty?
       raise DSL_ERROR__NO_START_URL if urls.empty?
 
-      db = Wgit::Database.new(connection_string)
-      indexer = Wgit::Indexer.new(db, crawler)
+      indexer = Wgit::Indexer.new(get_db, get_crawler)
       xpath = follow || :default
       crawl_opts = {
-        insert_externals: insert_externals, follow: xpath,
-        allow_paths: allow_paths, disallow_paths: disallow_paths
+        insert_externals:, follow: xpath, allow_paths:, disallow_paths:
       }
 
       urls.reduce(0) do |total, url|
@@ -239,9 +227,6 @@ the 'start' function".freeze
     #
     # @param urls [*Wgit::Url] The webpage URL's to crawl. Defaults to the
     #   `start` URL(s).
-    # @param connection_string [String] The database connection string. Set as
-    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
-    #   `connection_string`.
     # @param insert_externals [Boolean] Whether or not to insert the website's
     #   external URL's into the database.
     # @yield [doc] Given the Wgit::Document of the crawled webpage,
@@ -250,28 +235,24 @@ the 'start' function".freeze
     #   document from being saved into the database.
     # @raise [StandardError] If no urls are provided and no `start` URL has
     #   been set.
-    def index(
-      *urls, connection_string: @dsl_conn_str,
-      insert_externals: false, &block
-    )
+    def index(*urls, insert_externals: false, &block)
       urls = (@dsl_start || []) if urls.empty?
       raise DSL_ERROR__NO_START_URL if urls.empty?
 
-      db = Wgit::Database.new(connection_string)
-      indexer = Wgit::Indexer.new(db, crawler)
+      indexer = Wgit::Indexer.new(get_db, get_crawler)
 
       urls.map! { |url| Wgit::Url.parse(url) }
-      indexer.index_urls(*urls, insert_externals: insert_externals, &block)
+      indexer.index_urls(*urls, insert_externals:, &block)
     end
 
+    ### DATABASE METHODS ###
+
     # Performs a search of the database's indexed documents and pretty prints
-    # the results in a search engine-esque format. See `Wgit::Database#search!`
-    # and `Wgit::Document#search!` for details of how the search works.
+    # the results in a search engine-esque format. See
+    # `Wgit::Database::DatabaseAdapter#search` and `Wgit::Document#search!`
+    # for details of how the search methods work.
     #
     # @param query [String] The text query to search with.
-    # @param connection_string [String] The database connection string. Set as
-    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
-    #   `connection_string`.
     # @param stream [nil, #puts] Any object that respond_to?(:puts). It is used
     #   to output text somewhere e.g. a file or STDERR. Use nil for no output.
     # @param case_sensitive [Boolean] Whether character case must match.
@@ -285,41 +266,53 @@ the 'start' function".freeze
     #   database containing only its matching `#text`.
     # @return [Array<Wgit::Document>] The search results with matching text.
     def search(
-      query, connection_string: @dsl_conn_str, stream: STDOUT,
+      query, stream: $stdout,
+      top_result_only: true, include_score: false,
       case_sensitive: false, whole_sentence: true,
-      limit: 10, skip: 0, sentence_limit: 80, &block
+      limit: 10, skip: 0, sentence_limit: 80
     )
       stream ||= File.open(File::NULL, 'w')
-      db = Wgit::Database.new(connection_string)
 
-      results = db.search!(
-        query,
-        case_sensitive: case_sensitive,
-        whole_sentence: whole_sentence,
-        limit: limit,
-        skip: skip,
-        sentence_limit: sentence_limit,
-        &block
-      )
+      results = get_db.search(
+        query, case_sensitive:, whole_sentence:, limit:, skip:)
+
+      results.each do |doc|
+        doc.search_text!(
+          query, case_sensitive:, whole_sentence:, sentence_limit:)
+        yield(doc) if block_given?
+      end
 
-      Wgit::Utils.printf_search_results(results, stream: stream)
+      if top_result_only
+        Wgit::Utils.pprint_top_search_results(results, include_score:, stream:)
+      else
+        Wgit::Utils.pprint_all_search_results(results, include_score:, stream:)
+      end
 
       results
     end
 
     # Deletes everything in the urls and documents collections by calling
-    # `Wgit::Database#clear_db` underneath. This will nuke the entire database
-    # so yeah... be careful.
+    # `Wgit::Database::DatabaseAdapter#empty` underneath.
     #
     # @return [Integer] The number of deleted records.
-    def clear_db!(connection_string: @dsl_conn_str)
-      db = Wgit::Database.new(connection_string)
-      db.clear_db
+    def empty_db!
+      get_db.empty
+    end
+
+    private
+
+    def get_crawler
+      @dsl_crawler ||= Wgit::Crawler.new
+    end
+
+    def get_db
+      @dsl_db ||= Wgit::Database.new
     end
 
-    alias crawl_url crawl
-    alias crawl_r crawl_site
-    alias index_r index_site
-    alias start_urls start
+    alias_method :crawl_url, :crawl
+    alias_method :crawl_r, :crawl_site
+    alias_method :index_r, :index_site
+    alias_method :index_url, :index
+    alias_method :start_urls, :start
   end
 end
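Taken together, the dsl.rb changes swap connection strings for an injected database adapter, adopt Ruby 3.x hash-shorthand keyword passing, and rename `clear_db!` to `empty_db!`. A hedged end-to-end sketch of the 0.12.0 DSL surface (the URL is hypothetical):

```ruby
require "wgit"

include Wgit::DSL

start "http://example.com"

# Inject a connected adapter (replaces `connection_string`). If omitted,
# the private get_db falls back to Wgit::Database.new, which needs
# ENV['WGIT_CONNECTION_STRING'] to be set.
use_database(Wgit::Database.new)

index_site                               # No longer takes connection_string:.
search "example", top_result_only: true  # Pretty prints each doc's top match.
empty_db!                                # Renamed from clear_db!.
```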
data/lib/wgit/html_to_text.rb ADDED
@@ -0,0 +1,277 @@
+require_relative "utils"
+require_relative "assertable"
+require "nokogiri"
+
+module Wgit
+  # Class used to extract the visible page text from a HTML string.
+  # This is in turn used to set the output of a Wgit::Document#text method.
+  class HTMLToText
+    include Assertable
+
+    # Set of text elements used to extract the visible text.
+    # The element's display (:inline or :block) is used to delimit sentences e.g.
+    # <div>foo</div><div>bar</div> will be extracted as ['foo', 'bar'] whereas
+    # <span>foo</span><span>bar</span> will be extracted as ['foobar'].
+    @text_elements = {
+      a: :inline,
+      abbr: :inline,
+      address: :block,
+      article: :block,
+      aside: :block,
+      b: :inline,
+      bdi: :inline,
+      bdo: :inline,
+      blockquote: :block,
+      br: :block,
+      button: :block, # Normally inline but Wgit treats as block.
+      caption: :block,
+      cite: :inline,
+      code: :inline,
+      data: :inline,
+      dd: :block,
+      del: :inline,
+      details: :block,
+      dfn: :inline,
+      div: :block,
+      dl: :block,
+      dt: :block,
+      em: :inline,
+      figcaption: :block,
+      figure: :block,
+      footer: :block,
+      h1: :block,
+      h2: :block,
+      h3: :block,
+      h4: :block,
+      h5: :block,
+      h6: :block,
+      header: :block,
+      hr: :block,
+      i: :inline,
+      input: :inline,
+      ins: :block,
+      kbd: :inline,
+      label: :inline,
+      legend: :block,
+      li: :block,
+      main: :block,
+      mark: :inline,
+      meter: :block,
+      ol: :block,
+      option: :block,
+      output: :block,
+      p: :block,
+      pre: :block,
+      q: :inline,
+      rb: :inline,
+      rt: :inline,
+      ruby: :inline,
+      s: :inline,
+      samp: :inline,
+      section: :block,
+      small: :inline,
+      span: :inline,
+      strong: :inline,
+      sub: :inline,
+      summary: :block,
+      sup: :inline,
+      td: :block,
+      textarea: :block,
+      th: :block,
+      time: :inline,
+      u: :inline,
+      ul: :block,
+      var: :inline,
+      wbr: :inline
+    }
+
+    class << self
+      # Set of HTML elements that make up the visible text on a page. These
+      # elements are used to initialize the Wgit::Document#text. See the
+      # README.md for how to add to this Hash dynamically.
+      attr_reader :text_elements
+    end
+
+    # The Nokogiri::HTML document object initialized from a HTML string.
+    attr_reader :parser
+
+    # Creates a new HTML to text extractor instance.
+    #
+    # @param parser [Nokogiri::HTML4::Document] The nokogiri parser object.
+    # @raise [StandardError] If the given parser is of an invalid type.
+    def initialize(parser)
+      assert_type(parser, Nokogiri::HTML4::Document)
+
+      @parser = parser
+    end
+
+    # Extracts and returns the text sentences from the @parser HTML.
+    #
+    # @return [Array<String>] An array of unique text sentences.
+    def extract_arr
+      return [] if @parser.to_s.empty?
+
+      text_str = extract_str
+
+      # Split the text_str into an Array of text sentences.
+      text_str
+        .split("\n")
+        .map(&:strip)
+        .reject(&:empty?)
+    end
+
+    # Extracts and returns a text string from the @parser HTML.
+    #
+    # @return [String] A string of text with \n delimiting sentences.
+    def extract_str
+      text_str = ""
+
+      iterate_child_nodes(@parser) do |node, display|
+        # Handle any special cases e.g. skip nodes we don't care about...
+        # <pre> nodes should have their contents displayed exactly as is.
+        if node_name(node) == :pre
+          text_str << "\n"
+          text_str << node.text
+          next
+        end
+
+        # Skip any child node of <pre> since they're handled as a special case above.
+        next if child_of?(:pre, node)
+
+        if node.text?
+          # Skip any text element that is purely whitespace.
+          next unless valid_text_content?(node.text)
+        else
+          # Skip a concrete node if it has other concrete child nodes as these
+          # will be iterated onto later.
+          #
+          # Process if node has no children or one child which is a valid text node.
+          next unless node.children.empty? || parent_of_text_node_only?(node)
+        end
+
+        # Apply display rules deciding if a new line is needed before node.text.
+        add_new_line = false
+        prev = prev_sibling_or_parent(node)
+
+        if node.text?
+          add_new_line = true unless prev && inline?(prev)
+        else
+          add_new_line = true if display == :block
+          add_new_line = true if prev && block?(prev)
+        end
+
+        text_str << "\n" if add_new_line
+        text_str << format_text(node.text)
+      end
+
+      text_str
+        .strip
+        .squeeze("\n")
+        .squeeze(" ")
+    end
+
+    private
+
+    def node_name(node)
+      node.name&.downcase&.to_sym
+    end
+
+    def display(node)
+      name = node_name(node)
+      Wgit::HTMLToText.text_elements[name]
+    end
+
+    def inline?(node)
+      display(node) == :inline
+    end
+
+    def block?(node)
+      display(node) == :block
+    end
+
+    # Returns the previous sibling of node or nil. Only valid text elements are
+    # returned i.e. non duplicates with valid text content.
+    def prev_sibling(node)
+      prev = node.previous
+
+      return nil unless prev
+      return prev unless prev.text?
+      return prev if valid_text_node?(prev) && !contains_new_line?(prev.text)
+      return prev if valid_text_node?(prev) && !format_text(prev.text).strip.empty?
+
+      prev.previous
+    end
+
+    # Returns node's previous sibling, parent or nil; in that order. Only valid
+    # text elements are returned i.e. non duplicates with valid text content.
+    def prev_sibling_or_parent(node)
+      prev = prev_sibling(node)
+      return prev if prev
+
+      node.parent
+    end
+
+    def child_of?(ancestor_name, node)
+      node.ancestors.any? { |ancestor| node_name(ancestor) == ancestor_name }
+    end
+
+    # Returns true if any of the child nodes contain a non empty :text node.
+    def parent_of_text_node?(node)
+      node.children.any? { |child| child.text? && valid_text_content?(child.text) }
+    end
+
+    def parent_of_text_node_only?(node)
+      node.children.size == 1 && parent_of_text_node?(node)
+    end
+
+    # Returns true if text is not empty having removed all new lines.
+    def valid_text_content?(text)
+      !format_text(text).empty?
+    end
+
+    # Returns true if node is a text node.
+    # Duplicate text nodes (that follow a concrete node) are omitted.
+    def valid_text_node?(node)
+      node.text? && node.text != node.parent.text
+    end
+
+    def contains_new_line?(text)
+      ["\n", '\\n'].any? { |new_line| text.include?(new_line) }
+    end
+
+    # Remove special characters including any new lines; as semantic HTML will
+    # typically use <br> and/or block elements to denote a line break.
+    def format_text(text)
+      text
+        .encode("UTF-8", undef: :replace, invalid: :replace)
+        .gsub("\n", "")
+        .gsub('\\n', "")
+        .gsub("\r", "")
+        .gsub('\\r', "")
+        .gsub("\f", "")
+        .gsub('\\f', "")
+        .gsub("\t", "")
+        .gsub('\\t', "")
+        .gsub("&zwnj;", "")
+        .gsub("&nbsp;", " ")
+        .gsub("&#160;", " ")
+        .gsub("&thinsp;", " ")
+        .gsub("&ensp;", " ")
+        .gsub("&emsp;", " ")
+        .gsub('\u00a0', " ")
+    end
+
+    # Iterate over node and it's child nodes, yielding each to &block.
+    # Only HTMLToText.text_elements or valid :text nodes will be yielded.
+    # Duplicate text nodes (that follow a concrete node) are omitted.
+    def iterate_child_nodes(node, &block)
+      display = display(node)
+      text_node = valid_text_node?(node)
+
+      yield(node, display) if display || text_node
+      node.children.each { |child| iterate_child_nodes(child, &block) }
+    end
+
+    alias_method :extract, :extract_arr
+  end
+end
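Since `HTMLToText` only requires a Nokogiri document, it can also be exercised standalone. A minimal sketch, assuming the hypothetical markup shown; the expected outputs follow from the inline/block rules above:

```ruby
require "nokogiri"
require "wgit"

html = <<~HTML
  <p>Hello <b>world</b></p>
  <span>foo</span><span>bar</span>
HTML

extractor = Wgit::HTMLToText.new(Nokogiri::HTML(html))

extractor.extract_str # => "Hello world\nfoobar" (<b> is inline, <p> is block)
extractor.extract_arr # => ["Hello world", "foobar"]
```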