news_scraper 0.1.2 → 1.0.0
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rubocop.yml +1 -1
- data/README.md +2 -2
- data/config/article_scrape_patterns.yml +15 -0
- data/lib/news_scraper/cli.rb +9 -8
- data/lib/news_scraper/configuration.rb +33 -0
- data/lib/news_scraper/errors.rb +12 -3
- data/lib/news_scraper/extractors_helpers.rb +6 -2
- data/lib/news_scraper/scraper.rb +11 -4
- data/lib/news_scraper/trainer/preset_selector.rb +30 -40
- data/lib/news_scraper/trainer/url_trainer.rb +15 -20
- data/lib/news_scraper/transformers/article.rb +19 -20
- data/lib/news_scraper/transformers/nokogiri/functions.rb +15 -0
- data/lib/news_scraper/transformers/trainer_article.rb +17 -3
- data/lib/news_scraper/version.rb +1 -1
- data/lib/news_scraper.rb +18 -2
- data/news_scraper.gemspec +2 -0
- metadata +38 -3
- data/lib/news_scraper/constants.rb +0 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: db7d631f3f6cf73ff2e57b9e472804651b9fe1e0
+  data.tar.gz: 1045878eb97749d6b264a486ac34bfb89f4796dd
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a53423be5dbda33ead7dbb46bc494e40fcf412172a291496128b40512c985fc157c646481e1b8a183be2f709486e2cc7ec27d47a17bb9715c3b19dbe09dd7e42
+  data.tar.gz: eb43f129a0ca1a9f6eb02f24bfeb891583c96b0cce548afe43cf613231e8908a66e381347aeb16f5be9ea4af3ca96f82dcaf4da005951fb5ae5936d3f3530cbc
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
data/README.md
CHANGED
@@ -50,7 +50,7 @@ How the `Scraper` extracts and parses for the information is determined by scrap
 
 Calling `NewsScraper::Scraper#scrape` with either the array or block notation will yield `transformed_data` hashes. [`article_scrape_patterns.yml`](https://github.com/richardwu/news_scraper/blob/master/config/article_scrape_patterns.yml) defines the data types that will be scraped for.
 
-In addition, the `
+In addition, the `url` and `root_domain` (hostname) of the article will be returned in the hash too.
 
 Example
 ```
@@ -62,7 +62,7 @@ Example
   section: 'technology',
   datetime: '1991-10-05T12:00:00+00:00',
   title: 'Linus Linux',
-
+  url: 'https://linusworld.com/the-linux-kernel',
   root_domain: 'linusworld.com'
 }
 ```
data/config/article_scrape_patterns.yml
CHANGED
@@ -52,10 +52,16 @@ presets:
     article_tag: &article_tag_keywords
      method: "xpath"
      pattern: "//meta[@property='article:tag']/@content"
+    news_keywords: &news_keywords_keywords
+      method: "xpath"
+      pattern: "//meta[@name='news_keywords']/@content"
   section:
     meta: &meta_section
      method: "xpath"
      pattern: "//meta[@property='article:section']/@content"
+    section: &section_section
+      method: "xpath"
+      pattern: "//meta[@name='section']/@content"
   datetime:
     article_date_original: &article_date_original_datetime
      method: xpath
@@ -87,6 +93,15 @@ presets:
     sailthru_date: &sailthru_date_datetime
      method: xpath
      pattern: //meta[@name='sailthru.date']/@content
+    time: &time_datetime
+      method: xpath
+      pattern: //time/@datetime
+    date_published_datetime: &date_published_datetime_datetime
+      method: xpath
+      pattern: //meta[@itemprop="datePublished"]/@datetime
+    date_published_content: &date_published_content_datetime
+      method: xpath
+      pattern: //meta[@itemprop="datePublished"]/@content
   title:
     html: &html_title
      method: "xpath"
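These presets are consumed through the new configuration object (added in configuration.rb below). A quick sketch of reading one of the new datetime presets, assuming the bundled YAML is still the configured source:

```ruby
require 'news_scraper'

patterns = NewsScraper.configuration.scrape_patterns
patterns['presets']['datetime']['time']
# => { "method" => "xpath", "pattern" => "//time/@datetime" }
```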
data/lib/news_scraper/cli.rb
CHANGED
@@ -6,14 +6,13 @@ module NewsScraper
 
     DEFAULT_COLOR = "\x1b[36m".freeze
 
-    def log(message, color: DEFAULT_COLOR
-      message += "\n" if new_line
+    def log(message, color: DEFAULT_COLOR)
       $stdout.puts "#{color}┃\x1b[0m " + message
     end
 
-    def log_lines(message, color: DEFAULT_COLOR
+    def log_lines(message, color: DEFAULT_COLOR)
       message.split("\n").each do |line|
-        log(line, color: color
+        log(line, color: color)
       end
     end
 
@@ -49,8 +48,8 @@ module NewsScraper
       buf = -1
       available = (1..options.length).to_a
       until available.include?(buf.to_i)
-        begin
-
+        buf = begin
+          Readline.readline("\x1b[34m┃ > \x1b[33m", true)
         rescue Interrupt
           nil
         end
@@ -71,14 +70,16 @@ module NewsScraper
 
     ## Fancy Headers and Footers
 
-    def put_header(text = "", color
+    def put_header(text = "", color: DEFAULT_COLOR)
       put_edge(color, "┏━━ ", text)
     end
 
-    def put_footer(color
+    def put_footer(color: DEFAULT_COLOR)
       put_edge(color, "┗", "")
     end
 
+    private
+
     def put_edge(color, prefix, text)
       ptext = "#{color}#{prefix}#{text}"
       textwidth = printing_width(ptext)
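The net effect for callers: `color` is a proper keyword argument again, and `put_edge` drops out of the public surface. A minimal sketch, assuming `CLI` keeps its module-function style:

```ruby
NewsScraper::CLI.put_header("linusworld.com")
NewsScraper::CLI.log_lines("fetching...\nparsing...", color: "\x1b[33m")
NewsScraper::CLI.put_footer
# NewsScraper::CLI.put_edge(...) is private as of this release
```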
data/lib/news_scraper/configuration.rb
ADDED
@@ -0,0 +1,33 @@
+module NewsScraper
+  class Configuration
+    DEFAULT_SCRAPE_PATTERNS_FILEPATH = File.expand_path('../../../config/article_scrape_patterns.yml', __FILE__)
+    attr_accessor :fetch_method, :scrape_patterns_filepath
+
+    # <code>NewsScraper::Configuration.initialize</code> initializes the scrape_patterns_filepath
+    # and the fetch_method to the <code>DEFAULT_SCRAPE_PATTERNS_FILEPATH</code>
+    #
+    # Set the <code>scrape_patterns_filepath</code> to <code>nil</code> to disable saving during training
+    #
+    def initialize
+      self.scrape_patterns_filepath = DEFAULT_SCRAPE_PATTERNS_FILEPATH
+      self.fetch_method = proc { default_scrape_patterns }
+    end
+
+    # <code>NewsScraper::Configuration.scrape_patterns</code> proxies scrape_patterns
+    # requests to <code>fetch_method</code>:
+    #
+    # *Returns*
+    # - The result of calling the <code>fetch_method</code> proc, expected to be a hash
+    #
+    def scrape_patterns
+      fetch_method.call
+    end
+
+    private
+
+    def default_scrape_patterns
+      @default_scrape_patterns ||= {}
+      @default_scrape_patterns[scrape_patterns_filepath] ||= YAML.load_file(scrape_patterns_filepath)
+    end
+  end
+end
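Together with the `NewsScraper.configure` block added in news_scraper.rb below, this makes the patterns source pluggable. A sketch (the custom filepath is hypothetical):

```ruby
NewsScraper.configure do |config|
  # Point at a custom patterns file...
  config.scrape_patterns_filepath = 'config/custom_patterns.yml'
  # ...or bypass the filesystem with any callable returning the patterns hash
  config.fetch_method = proc { { 'data_types' => [], 'domains' => {}, 'presets' => {} } }
end
```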
data/lib/news_scraper/errors.rb
CHANGED
@@ -1,14 +1,23 @@
 
 module NewsScraper
-  class ResponseError < StandardError
+  class ResponseError < StandardError
+    attr_reader :error_code, :message, :url
+
+    def initialize(opts = {})
+      @error_code = opts[:error_code]
+      @message = opts[:message]
+      @url = opts[:url]
+      super
+    end
+  end
 
   module Transformers
     class ScrapePatternNotDefined < StandardError
-      attr_reader :root_domain, :
+      attr_reader :root_domain, :url
 
       def initialize(opts = {})
         @root_domain = opts[:root_domain]
-        @
+        @url = opts[:url]
         super
       end
     end
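A sketch of rescuing the enriched error (the `Scraper.new(query:)` call is assumed from the README's usage; the reader attributes come from the class above):

```ruby
begin
  NewsScraper::Scraper.new(query: 'ruby').scrape
rescue NewsScraper::ResponseError => e
  warn "#{e.url} responded with #{e.error_code}: #{e.message}"
end
```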
data/lib/news_scraper/extractors_helpers.rb
CHANGED
@@ -10,9 +10,13 @@ module NewsScraper
 
     CLI.put_header(url)
     CLI.log "Beginning HTTP request for #{url}"
-    response = HTTParty.get(url)
+    response = HTTParty.get(url, headers: { "User-Agent" => "news-scraper-#{NewsScraper::VERSION}" })
 
-    raise ResponseError.new(
+    raise ResponseError.new(
+      error_code: response.code,
+      message: response.message,
+      url: url
+    ) unless response.code == 200
 
     CLI.log "#{response.code} - #{response.message}. Request successful for #{url}"
     CLI.put_footer
data/lib/news_scraper/scraper.rb
CHANGED
@@ -16,6 +16,7 @@ module NewsScraper
     #
     # *Raises*
     # - Will raise a <code>Transformers::ScrapePatternNotDefined</code> if an article is not in the root domains
+    # - Will <code>yield</code> the error if a block is given
     # - Root domains are specified by the <code>article_scrape_patterns.yml</code> file
     # - This root domain will need to be trained, it would be helpful to have a PR created to train the domain
     # - You can train the domain by running <code>NewsScraper::Trainer::UrlTrainer.new(URL_TO_TRAIN).train</code>
@@ -27,13 +28,19 @@ module NewsScraper
       article_urls = Extractors::GoogleNewsRss.new(query: @query).extract
 
       transformed_articles = []
+
       article_urls.each do |article_url|
         payload = Extractors::Article.new(url: article_url).extract
+        article_transformer = Transformers::Article.new(url: article_url, payload: payload)
 
-
-
-
-
+        begin
+          transformed_article = article_transformer.transform
+          transformed_articles << transformed_article
+          yield transformed_article if block_given?
+        rescue Transformers::ScrapePatternNotDefined => e
+          raise e unless block_given?
+          yield e
+        end
       end
 
       transformed_articles
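With the block form, an untrained domain now yields the error instead of aborting the whole run. A sketch (constructor signature assumed, as above):

```ruby
NewsScraper::Scraper.new(query: 'linux').scrape do |result|
  case result
  when NewsScraper::Transformers::ScrapePatternNotDefined
    warn "no pattern trained for #{result.root_domain} (#{result.url})"
  else
    puts result[:title]
  end
end
```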
data/lib/news_scraper/trainer/preset_selector.rb
CHANGED
@@ -3,18 +3,16 @@ module NewsScraper
     class PresetSelector
       PROVIDER_PHRASE = 'I will provide a pattern using'.freeze
 
-      def initialize(
+      def initialize(url:, payload:)
         @url = url
         @payload = payload
-        @data_type_presets = data_type_presets
-        @data_type = data_type
       end
 
-      def select
-
+      def select(data_type)
+        pattern_options = pattern_options(data_type)
 
         selected_option = CLI.prompt_with_options(
-          "Select which preset to use for #{
+          "Select which preset to use for #{data_type}:",
          pattern_options.keys
        )
 
@@ -27,50 +25,42 @@ module NewsScraper
        end
        return if selected_option == 'skip'
 
-
-
-
+        selected_preset_code = pattern_options[selected_option]
+        result = transform_results[data_type][selected_preset_code].merge(
+          'variable' => [selected_preset_code, data_type].join('_')
+        )
+        result.delete('data')
+        result
      end
 
      private
 
-      def pattern_options
-
-
-
-
-
-
-
-        end
-        %w(xpath css).each do |pattern_provider|
-          temp_options["#{PROVIDER_PHRASE} #{pattern_provider}"] = pattern_provider
+      def pattern_options(data_type)
+        # Add valid options from the transformed results
+        options = transform_results[data_type].each_with_object({}) do |(option, details), valid_options|
+          next unless details['data'] && !details['data'].empty?
+          table_key = Terminal::Table.new do |t|
+            t << ['method', details['method']]
+            t << ['pattern', details['pattern']]
+            t << ['data', details['data']]
          end
-
+          valid_options["\n#{table_key}"] = option
        end
-        end
 
-
-
-
-
-        @results ||= @data_type_presets.each_with_object({}) do |(preset_name, preset_details), hash|
-          scrape_details[@data_type] = preset_details
-          train_transformer = Transformers::TrainerArticle.new(
-            url: @url,
-            payload: @payload,
-            scrape_details: scrape_details,
-          )
+        # Add in options to customize the pattern
+        %w(xpath css).each do |pattern_provider|
+          options["#{PROVIDER_PHRASE} #{pattern_provider}"] = pattern_provider
+        end
 
-
-        end.to_a
+        # Add option in to skip
+        options.merge('skip' => 'skip')
      end
 
-      def
-      @
-
-
+      def transform_results
+        @transform_results ||= Transformers::TrainerArticle.new(
+          url: @url,
+          payload: @payload
+        ).transform
      end
    end
  end
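For reference, `select` now returns the chosen preset's scrape details with a generated `variable` name and the sampled `data` key stripped, roughly (values illustrative):

```ruby
selector = NewsScraper::Trainer::PresetSelector.new(url: url, payload: payload)
selector.select('title')
# => { "method" => "xpath", "pattern" => "//title", "variable" => "html_title" }
```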
data/lib/news_scraper/trainer/url_trainer.rb
CHANGED
@@ -8,18 +8,17 @@ module NewsScraper
      end
 
      def train
-        return if
+        return if NewsScraper.configuration.scrape_patterns['domains'].key?(@root_domain)
 
        CLI.put_header(@root_domain)
-        CLI.log("There is no scrape pattern defined for #{@root_domain}
+        CLI.log("There is no scrape pattern defined for #{@root_domain}")
        CLI.log "Fetching information..."
        CLI.put_footer
 
        selected_presets = {}
-
+        NewsScraper.configuration.scrape_patterns['data_types'].each do |data_type|
          selected_presets[data_type] = selected_pattern(data_type)
        end
-
        save_selected_presets(selected_presets)
      end
 
@@ -27,29 +26,29 @@ module NewsScraper
 
      def selected_pattern(data_type)
        CLI.put_header("Determining information for #{data_type}")
-
-        pattern = if data_type_presets.nil?
+        pattern = if NewsScraper.configuration.scrape_patterns['presets'][data_type].nil?
                    CLI.log("No presets were found for #{data_type}. Skipping to next.")
                    nil
                  else
-
-                      url: @url,
-                      payload: @payload,
-                      data_type_presets: data_type_presets,
-                      data_type: data_type
-                    ).select
+                    preset_selector.select(data_type)
                  end
        CLI.put_footer
-
        pattern || { 'method' => "<<<<< TODO >>>>>", 'pattern' => "<<<<< TODO >>>>>" }
      end
 
+      def preset_selector
+        @preset_selector ||= PresetSelector.new(url: @url, payload: @payload)
+      end
+
      def save_selected_presets(selected_presets)
-
+        return unless NewsScraper.configuration.scrape_patterns_filepath
+
+        current_content = File.read(NewsScraper.configuration.scrape_patterns_filepath).chomp
        new_content = "#{current_content}\n#{build_domain_yaml(selected_presets)}\n"
 
-        File.write(
-        CLI.log("Successfully wrote presets for #{@root_domain} to
+        File.write(NewsScraper.configuration.scrape_patterns_filepath, new_content)
+        CLI.log("Successfully wrote presets for #{@root_domain} to"\
+                " #{NewsScraper.configuration.scrape_patterns_filepath}.")
      end
 
      def build_domain_yaml(selected_presets)
@@ -65,10 +64,6 @@ module NewsScraper
        end
        output_string.join("\n")
      end
-
-      def article_scrape_patterns
-        @article_scrape_patterns ||= YAML.load_file(Constants::SCRAPE_PATTERN_FILEPATH)
-      end
    end
  end
end
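The caller-facing entry point is unchanged (it is quoted in scraper.rb's docs above), and saving can now be switched off through the configuration:

```ruby
# Setting the filepath to nil disables saving during training
# (per the Configuration docs above)
NewsScraper.configure { |c| c.scrape_patterns_filepath = nil }

NewsScraper::Trainer::UrlTrainer.new('https://linusworld.com/the-linux-kernel').train
```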
data/lib/news_scraper/transformers/article.rb
CHANGED
@@ -2,6 +2,7 @@ require 'nokogiri'
 require 'sanitize'
 require 'readability'
 require 'htmlbeautifier'
+require 'news_scraper/transformers/nokogiri/functions'
 
 module NewsScraper
   module Transformers
@@ -13,9 +14,8 @@ module NewsScraper
     # - <code>payload</code>: keyword arg - the result of the scrape
     #
     def initialize(url:, payload:)
-
-      @
-      @root_domain = uri_parser.host
+      @url = url
+      @root_domain = URIParser.new(url).host
       @payload = payload
     end
 
@@ -28,37 +28,36 @@ module NewsScraper
     # - <code>transformed_response</code>: the response that has been parsed and transformed to a hash
     #
     def transform
-
-
-      transformed_response.merge(
+      scrape_details = NewsScraper.configuration.scrape_patterns['domains'][@root_domain]
+      raise ScrapePatternNotDefined.new(url: @url, root_domain: @root_domain) unless scrape_details
+      transformed_response(scrape_details).merge(url: @url, root_domain: @root_domain)
     end
 
     private
 
-    def scrape_details
-
-
+    def transformed_response(scrape_details)
+      NewsScraper.configuration.scrape_patterns['data_types'].each_with_object({}) do |data_type, response|
+        response[data_type.to_sym] = nil
+        next unless scrape_details[data_type]
 
-
-
-
+        response[data_type.to_sym] = parsed_data(
+          scrape_details[data_type]['method'].to_sym,
+          scrape_details[data_type]['pattern']
+        )
       end
     end
 
-    def parsed_data(
-      return nil unless scrape_details[data_type]
-
-      scrape_method = scrape_details[data_type]['method'].to_sym
+    def parsed_data(scrape_method, scrape_pattern)
       case scrape_method
       when :xpath
-        noko_html = Nokogiri::HTML(@payload)
+        noko_html = ::Nokogiri::HTML(@payload)
         Sanitize.fragment(
-          noko_html.
+          noko_html.xpath("(#{scrape_pattern})[1]", Nokogiri::Functions.new)
         ).squish
       when :css
-        noko_html = Nokogiri::HTML(@payload)
+        noko_html = ::Nokogiri::HTML(@payload)
         Sanitize.fragment(
-          noko_html.
+          noko_html.css(scrape_pattern)
         ).squish
       when :readability
         content = Readability::Document.new(
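A sketch of the transformer for a trained domain (the payload source and hash values are illustrative; the `url`/`root_domain` keys are documented in the README diff above):

```ruby
raw_html = File.read('article.html') # e.g. the body fetched by Extractors::Article
transformer = NewsScraper::Transformers::Article.new(
  url: 'https://linusworld.com/the-linux-kernel',
  payload: raw_html
)
transformer.transform
# => { section: 'technology', title: 'Linus Linux', ...,
#      url: 'https://linusworld.com/the-linux-kernel', root_domain: 'linusworld.com' }
```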
data/lib/news_scraper/transformers/nokogiri/functions.rb
ADDED
@@ -0,0 +1,15 @@
+require 'nokogiri'
+
+module NewsScraper
+  module Transformers
+    module Nokogiri
+      class Functions
+        # Implements fn:string-join of XPath 2.0
+        def string_join(nodeset, separator)
+          nodeset.map(&:text).join(separator)
+        end
+        alias_method :'string-join', :string_join
+      end
+    end
+  end
+end
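Since `parsed_data` passes `Nokogiri::Functions.new` as a custom-function handler (see article.rb above), trained xpath patterns can call `string-join` even though stock XPath 1.0 lacks it. A sketch with an illustrative document:

```ruby
doc = ::Nokogiri::HTML(<<~HTML)
  <meta property="article:tag" content="linux">
  <meta property="article:tag" content="kernel">
HTML
doc.xpath(
  "string-join(//meta[@property='article:tag']/@content, ',')",
  NewsScraper::Transformers::Nokogiri::Functions.new
)
# => "linux,kernel"
```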
data/lib/news_scraper/transformers/trainer_article.rb
CHANGED
@@ -6,12 +6,26 @@ module NewsScraper
     # *Params*
     # - <code>url</code>: keyword arg - the url on which scraping was done
     # - <code>payload</code>: keyword arg - the result of the scrape
-    # - <code>scrape_details</code>: keyword arg - The pattern/methods for the domain to use in the transformation
     #
-    def initialize(url:, payload
-      @scrape_details = scrape_details
+    def initialize(url:, payload:)
       super(url: url, payload: payload)
     end
+
+    # Transform the article
+    #
+    # *Returns*
+    # - <code>transformed_response</code>: tries all possible presets and returns a hash representing the results
+    #
+    def transform
+      presets = NewsScraper.configuration.scrape_patterns['presets']
+      transformed_response = presets.each_with_object({}) do |(data_type, preset_options), response|
+        response[data_type] = preset_options.each_with_object({}) do |(option, scrape_details), data_type_options|
+          data = parsed_data(scrape_details['method'].to_sym, scrape_details['pattern'])
+          data_type_options[option] = scrape_details.merge('data' => data)
+        end
+      end
+      transformed_response.merge('url' => @url, 'root_domain' => @root_domain)
+    end
   end
 end
 end
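So the trainer transform returns, per data type, every preset's details plus the data it actually matched, which is what `PresetSelector` tabulates. Illustrative shape (data values invented):

```ruby
NewsScraper::Transformers::TrainerArticle.new(url: url, payload: payload).transform
# => {
#      "datetime" => {
#        "time" => { "method" => "xpath", "pattern" => "//time/@datetime",
#                    "data" => "2016-09-25T12:00:00+00:00" },
#        ...
#      },
#      ...,
#      "url" => url, "root_domain" => "..."
#    }
```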
data/lib/news_scraper/version.rb
CHANGED
data/lib/news_scraper.rb
CHANGED
@@ -1,7 +1,8 @@
 require 'httparty'
 require 'yaml'
+require 'terminal-table'
 
-require 'news_scraper/constants'
+require 'news_scraper/configuration'
 require 'news_scraper/uri_parser'
 require 'news_scraper/active_support_lite/string'
 
@@ -23,11 +24,12 @@ require 'news_scraper/trainer'
 
 module NewsScraper
   extend self
+  attr_writer :configuration
 
   # <code>NewsScraper::train</code> is an interactive command-line prompt that:
   #
   # 1. Collates all articles for the given :query
-  # 2. Grep for <code>:data_types</code> using <code>:presets</code> in <code>
+  # 2. Grep for <code>:data_types</code> using <code>:presets</code> in the config set in the <code>configuration</code>
   # 3. Displays the results of each <code>:preset</code> grep for a given <code>:data_type</code>
   # 4. Prompts to select one of the <code>:presets</code> or define a pattern for that domain's <code>:data_type</code>
   # N.B: User may ignore all presets and manually configure it in the YAML file
@@ -36,7 +38,21 @@ module NewsScraper
   # *Params*
   # - <code>query</code>: a keyword arugment specifying the query to train on
   #
+  # :nocov:
   def train(query:)
     Trainer.train(query: query)
   end
+  # :nocov:
+
+  def configuration
+    @configuration ||= Configuration.new
+  end
+
+  def reset_configuration
+    @configuration = Configuration.new
+  end
+
+  def configure
+    yield(configuration)
+  end
 end
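A sketch of the accessor trio in use, e.g. to isolate configuration between test cases:

```ruby
NewsScraper.configure { |c| c.scrape_patterns_filepath = nil }
NewsScraper.configuration.scrape_patterns_filepath # => nil

NewsScraper.reset_configuration
NewsScraper.configuration.scrape_patterns_filepath
# => NewsScraper::Configuration::DEFAULT_SCRAPE_PATTERNS_FILEPATH
```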
data/news_scraper.gemspec
CHANGED
@@ -29,6 +29,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency 'sanitize', '~> 4.2', '>= 4.2.0'
   spec.add_dependency 'ruby-readability', '~> 0.7', '>= 0.7.0'
   spec.add_dependency 'htmlbeautifier', '~> 1.1', '>= 1.1.1'
+  spec.add_dependency 'terminal-table', '~> 1.5', '>= 1.5.2'
 
   spec.add_development_dependency 'bundler', '~> 1.12', '>= 1.12.0'
   spec.add_development_dependency 'rake', '~> 10.0', '>= 10.0.0'
@@ -38,4 +39,5 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency 'timecop', '~> 0.8', '>= 0.8.0'
   spec.add_development_dependency 'rubocop', '~> 0.42', '>= 0.42.0'
   spec.add_development_dependency 'rdoc', '~> 4.2', '>= 4.2.2'
+  spec.add_development_dependency 'simplecov', '~> 0.12.0'
 end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: news_scraper
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 1.0.0
 platform: ruby
 authors:
 - Richard Wu
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-09-
+date: 2016-09-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -111,6 +111,26 @@ dependencies:
   - - ">="
     - !ruby/object:Gem::Version
       version: 1.1.1
+- !ruby/object:Gem::Dependency
+  name: terminal-table
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.5'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.5.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.5'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.5.2
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -271,6 +291,20 @@ dependencies:
   - - ">="
     - !ruby/object:Gem::Version
       version: 4.2.2
+- !ruby/object:Gem::Dependency
+  name: simplecov
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.12.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.12.0
 description: A collection of extractors, transformers and loaders for scraping news
   websites and syndicates.
 email:
@@ -296,7 +330,7 @@ files:
 - lib/news_scraper.rb
 - lib/news_scraper/active_support_lite/string.rb
 - lib/news_scraper/cli.rb
-- lib/news_scraper/constants.rb
+- lib/news_scraper/configuration.rb
 - lib/news_scraper/errors.rb
 - lib/news_scraper/extractors/article.rb
 - lib/news_scraper/extractors/google_news_rss.rb
@@ -306,6 +340,7 @@ files:
 - lib/news_scraper/trainer/preset_selector.rb
 - lib/news_scraper/trainer/url_trainer.rb
 - lib/news_scraper/transformers/article.rb
+- lib/news_scraper/transformers/nokogiri/functions.rb
 - lib/news_scraper/transformers/trainer_article.rb
 - lib/news_scraper/uri_parser.rb
 - lib/news_scraper/version.rb