wgit 0.10.8 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +72 -1
- data/CODE_OF_CONDUCT.md +1 -1
- data/CONTRIBUTING.md +2 -2
- data/README.md +24 -20
- data/bin/wgit +75 -19
- data/lib/wgit/assertable.rb +33 -6
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +102 -37
- data/lib/wgit/database/adapters/in_memory.rb +204 -0
- data/lib/wgit/database/adapters/mongo_db.rb +627 -0
- data/lib/wgit/database/database.rb +18 -651
- data/lib/wgit/database/database_adapter.rb +147 -0
- data/lib/wgit/document.rb +222 -98
- data/lib/wgit/document_extractors.rb +16 -10
- data/lib/wgit/dsl.rb +74 -81
- data/lib/wgit/html_to_text.rb +277 -0
- data/lib/wgit/indexer.rb +184 -71
- data/lib/wgit/logger.rb +2 -2
- data/lib/wgit/model.rb +164 -0
- data/lib/wgit/response.rb +25 -13
- data/lib/wgit/robots_parser.rb +193 -0
- data/lib/wgit/url.rb +150 -90
- data/lib/wgit/utils.rb +200 -37
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +18 -13
- metadata +56 -43
- data/lib/wgit/database/model.rb +0 -60
data/lib/wgit/database/model.rb
DELETED
@@ -1,60 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require_relative '../utils'
|
4
|
-
|
5
|
-
module Wgit
  # Builds the plain Hash representations (the data model) of Wgit objects
  # that get stored in the database collections.
  module Model
    # Builds the model for a Wgit::Url collection object. The same model is
    # embedded under the 'url' key of a Wgit::Document model.
    #
    # @param url [Wgit::Url] The Url data object; must respond to #to_h.
    # @raise [RuntimeError] If url doesn't respond to #to_h.
    # @return [Hash] The URL model ready for DB insertion.
    def self.url(url)
      raise 'url must respond_to? :to_h' unless url.respond_to?(:to_h)

      select_bson_types(url.to_h)
    end

    # Builds the model for a Wgit::Document collection object. The HTML and
    # score are excluded from the model.
    #
    # @param doc [Wgit::Document] The Document data object; must respond to
    #   #to_h and #url.
    # @raise [RuntimeError] If doc (or doc.url) doesn't respond to #to_h.
    # @return [Hash] The Document model ready for DB insertion.
    def self.document(doc)
      raise 'doc must respond_to? :to_h' unless doc.respond_to?(:to_h)

      model = doc.to_h(include_html: false, include_score: false)
      model['url'] = url(doc.url) # Expand Url String into full object.

      select_bson_types(model)
    end

    # Common fields when inserting a record into the DB.
    #
    # @return [Hash] Insertion fields common to all models.
    def self.common_insert_data
      # Read the clock once so that a freshly inserted record always has
      # date_added == date_modified; two separate calls could differ.
      now = Wgit::Utils.time_stamp

      {
        date_added: now,
        date_modified: now
      }
    end

    # Common fields when updating a record in the DB.
    #
    # @return [Hash] Update fields common to all models.
    def self.common_update_data
      {
        date_modified: Wgit::Utils.time_stamp
      }
    end

    # Returns a copy of the model containing only the entries whose value
    # responds to #bson_type (for use with MongoDB). Only the top level of
    # the Hash is inspected.
    #
    # @param model_hash [Hash] The model Hash to sanitize.
    # @return [Hash] The model Hash with non bson types removed.
    def self.select_bson_types(model_hash)
      model_hash.select { |_key, value| value.respond_to?(:bson_type) }
    end
  end
end
|