news_scraper 0.1.2 → 1.0.0
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rubocop.yml +1 -1
- data/README.md +2 -2
- data/config/article_scrape_patterns.yml +15 -0
- data/lib/news_scraper/cli.rb +9 -8
- data/lib/news_scraper/configuration.rb +33 -0
- data/lib/news_scraper/errors.rb +12 -3
- data/lib/news_scraper/extractors_helpers.rb +6 -2
- data/lib/news_scraper/scraper.rb +11 -4
- data/lib/news_scraper/trainer/preset_selector.rb +30 -40
- data/lib/news_scraper/trainer/url_trainer.rb +15 -20
- data/lib/news_scraper/transformers/article.rb +19 -20
- data/lib/news_scraper/transformers/nokogiri/functions.rb +15 -0
- data/lib/news_scraper/transformers/trainer_article.rb +17 -3
- data/lib/news_scraper/version.rb +1 -1
- data/lib/news_scraper.rb +18 -2
- data/news_scraper.gemspec +2 -0
- metadata +38 -3
- data/lib/news_scraper/constants.rb +0 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: db7d631f3f6cf73ff2e57b9e472804651b9fe1e0
+  data.tar.gz: 1045878eb97749d6b264a486ac34bfb89f4796dd
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a53423be5dbda33ead7dbb46bc494e40fcf412172a291496128b40512c985fc157c646481e1b8a183be2f709486e2cc7ec27d47a17bb9715c3b19dbe09dd7e42
+  data.tar.gz: eb43f129a0ca1a9f6eb02f24bfeb891583c96b0cce548afe43cf613231e8908a66e381347aeb16f5be9ea4af3ca96f82dcaf4da005951fb5ae5936d3f3530cbc
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
data/README.md
CHANGED
@@ -50,7 +50,7 @@ How the `Scraper` extracts and parses for the information is determined by scrap
 
 Calling `NewsScraper::Scraper#scrape` with either the array or block notation will yield `transformed_data` hashes. [`article_scrape_patterns.yml`](https://github.com/richardwu/news_scraper/blob/master/config/article_scrape_patterns.yml) defines the data types that will be scraped for.
 
-In addition, the `
+In addition, the `url` and `root_domain`(hostname) of the article will be returned in the hash too.
 
 Example
 ```
@@ -62,7 +62,7 @@ Example
   section: 'technology',
   datetime: '1991-10-05T12:00:00+00:00',
   title: 'Linus Linux',
-
+  url: 'https://linusworld.com/the-linux-kernel',
   root_domain: 'linusworld.com'
 }
 ```
data/config/article_scrape_patterns.yml
CHANGED
@@ -52,10 +52,16 @@ presets:
     article_tag: &article_tag_keywords
       method: "xpath"
       pattern: "//meta[@property='article:tag']/@content"
+    news_keywords: &news_keywords_keywords
+      method: "xpath"
+      pattern: "//meta[@name='news_keywords']/@content"
   section:
     meta: &meta_section
       method: "xpath"
       pattern: "//meta[@property='article:section']/@content"
+    section: &section_section
+      method: "xpath"
+      pattern: "//meta[@name='section']/@content"
   datetime:
     article_date_original: &article_date_original_datetime
       method: xpath
@@ -87,6 +93,15 @@ presets:
     sailthru_date: &sailthru_date_datetime
       method: xpath
       pattern: //meta[@name='sailthru.date']/@content
+    time: &time_datetime
+      method: xpath
+      pattern: //time/@datetime
+    date_published_datetime: &date_published_datetime_datetime
+      method: xpath
+      pattern: //meta[@itemprop="datePublished"]/@datetime
+    date_published_content: &date_published_content_datetime
+      method: xpath
+      pattern: //meta[@itemprop="datePublished"]/@content
   title:
     html: &html_title
       method: "xpath"
data/lib/news_scraper/cli.rb
CHANGED
@@ -6,14 +6,13 @@ module NewsScraper
 
     DEFAULT_COLOR = "\x1b[36m".freeze
 
-    def log(message, color: DEFAULT_COLOR
-      message += "\n" if new_line
+    def log(message, color: DEFAULT_COLOR)
       $stdout.puts "#{color}┃\x1b[0m " + message
     end
 
-    def log_lines(message, color: DEFAULT_COLOR
+    def log_lines(message, color: DEFAULT_COLOR)
       message.split("\n").each do |line|
-        log(line, color: color
+        log(line, color: color)
       end
     end
 
@@ -49,8 +48,8 @@ module NewsScraper
      buf = -1
      available = (1..options.length).to_a
      until available.include?(buf.to_i)
-        begin
-
+        buf = begin
+          Readline.readline("\x1b[34m┃ > \x1b[33m", true)
        rescue Interrupt
          nil
        end
@@ -71,14 +70,16 @@ module NewsScraper
 
     ## Fancy Headers and Footers
 
-    def put_header(text = "", color
+    def put_header(text = "", color: DEFAULT_COLOR)
       put_edge(color, "┏━━ ", text)
     end
 
-    def put_footer(color
+    def put_footer(color: DEFAULT_COLOR)
       put_edge(color, "┗", "")
     end
 
+    private
+
     def put_edge(color, prefix, text)
       ptext = "#{color}#{prefix}#{text}"
       textwidth = printing_width(ptext)
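
Not part of the diff: with the keyword arguments repaired above, the CLI helpers compose as in this illustrative sketch (assumes the module-level calls seen elsewhere in the gem):

```
NewsScraper::CLI.put_header("linusworld.com")
NewsScraper::CLI.log_lines("Fetching information...\nParsing...")
NewsScraper::CLI.put_footer
```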
data/lib/news_scraper/configuration.rb
ADDED
@@ -0,0 +1,33 @@
+module NewsScraper
+  class Configuration
+    DEFAULT_SCRAPE_PATTERNS_FILEPATH = File.expand_path('../../../config/article_scrape_patterns.yml', __FILE__)
+    attr_accessor :fetch_method, :scrape_patterns_filepath
+
+    # <code>NewsScraper::Configuration.initialize</code> initializes the scrape_patterns_filepath
+    # and the fetch_method to the <code>DEFAULT_SCRAPE_PATTERNS_FILEPATH</code>
+    #
+    # Set the <code>scrape_patterns_filepath</code> to <code>nil</code> to disable saving during training
+    #
+    def initialize
+      self.scrape_patterns_filepath = DEFAULT_SCRAPE_PATTERNS_FILEPATH
+      self.fetch_method = proc { default_scrape_patterns }
+    end
+
+    # <code>NewsScraper::Configuration.scrape_patterns</code> proxies scrape_patterns
+    # requests to <code>fetch_method</code>:
+    #
+    # *Returns*
+    # - The result of calling the <code>fetch_method</code> proc, expected to be a hash
+    #
+    def scrape_patterns
+      fetch_method.call
+    end
+
+    private
+
+    def default_scrape_patterns
+      @default_scrape_patterns ||= {}
+      @default_scrape_patterns[scrape_patterns_filepath] ||= YAML.load_file(scrape_patterns_filepath)
+    end
+  end
+end
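
Not part of the diff: `Configuration` memoizes the parsed YAML per filepath and routes every lookup through `fetch_method`, so scrape patterns can come from any source. A minimal sketch, assuming a hypothetical local patterns file:

```
config = NewsScraper::Configuration.new
config.scrape_patterns_filepath = 'my_patterns.yml' # hypothetical path
config.scrape_patterns                              # loads and memoizes the YAML

# Or bypass the filesystem entirely with a custom fetch_method:
config.fetch_method = proc { { 'data_types' => [], 'presets' => {}, 'domains' => {} } }
```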
data/lib/news_scraper/errors.rb
CHANGED
@@ -1,14 +1,23 @@
 
 module NewsScraper
-  class ResponseError < StandardError
+  class ResponseError < StandardError
+    attr_reader :error_code, :message, :url
+
+    def initialize(opts = {})
+      @error_code = opts[:error_code]
+      @message = opts[:message]
+      @url = opts[:url]
+      super
+    end
+  end
 
   module Transformers
     class ScrapePatternNotDefined < StandardError
-      attr_reader :root_domain, :
+      attr_reader :root_domain, :url
 
       def initialize(opts = {})
         @root_domain = opts[:root_domain]
-        @
+        @url = opts[:url]
         super
       end
     end
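
Not part of the diff: `ResponseError` now carries the failing request's details. A sketch of rescuing it, assuming `Scraper#scrape` surfaces the error raised by the extractors below:

```
begin
  NewsScraper::Scraper.new(query: 'linux').scrape
rescue NewsScraper::ResponseError => e
  warn "HTTP #{e.error_code} for #{e.url}: #{e.message}"
end
```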
data/lib/news_scraper/extractors_helpers.rb
CHANGED
@@ -10,9 +10,13 @@ module NewsScraper
 
     CLI.put_header(url)
     CLI.log "Beginning HTTP request for #{url}"
-    response = HTTParty.get(url)
+    response = HTTParty.get(url, headers: { "User-Agent" => "news-scraper-#{NewsScraper::VERSION}" })
 
-    raise ResponseError.new(
+    raise ResponseError.new(
+      error_code: response.code,
+      message: response.message,
+      url: url
+    ) unless response.code == 200
 
     CLI.log "#{response.code} - #{response.message}. Request successful for #{url}"
     CLI.put_footer
data/lib/news_scraper/scraper.rb
CHANGED
@@ -16,6 +16,7 @@ module NewsScraper
     #
     # *Raises*
     # - Will raise a <code>Transformers::ScrapePatternNotDefined</code> if an article is not in the root domains
+    # - Will <code>yield</code> the error if a block is given
     # - Root domains are specified by the <code>article_scrape_patterns.yml</code> file
     # - This root domain will need to be trained, it would be helpful to have a PR created to train the domain
     # - You can train the domain by running <code>NewsScraper::Trainer::UrlTrainer.new(URL_TO_TRAIN).train</code>
@@ -27,13 +28,19 @@ module NewsScraper
      article_urls = Extractors::GoogleNewsRss.new(query: @query).extract
 
      transformed_articles = []
+
      article_urls.each do |article_url|
        payload = Extractors::Article.new(url: article_url).extract
+        article_transformer = Transformers::Article.new(url: article_url, payload: payload)
 
-
-
-
-
+        begin
+          transformed_article = article_transformer.transform
+          transformed_articles << transformed_article
+          yield transformed_article if block_given?
+        rescue Transformers::ScrapePatternNotDefined => e
+          raise e unless block_given?
+          yield e
+        end
      end
 
      transformed_articles
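
Not part of the diff: with the rescue added above, the block form yields `ScrapePatternNotDefined` errors instead of aborting, so one untrained domain no longer halts a scrape. A sketch:

```
NewsScraper::Scraper.new(query: 'linux').scrape do |result|
  if result.is_a?(NewsScraper::Transformers::ScrapePatternNotDefined)
    warn "No scrape pattern for #{result.root_domain}, skipping"
  else
    puts result[:title]
  end
end
```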
data/lib/news_scraper/trainer/preset_selector.rb
CHANGED
@@ -3,18 +3,16 @@ module NewsScraper
     class PresetSelector
       PROVIDER_PHRASE = 'I will provide a pattern using'.freeze
 
-      def initialize(
+      def initialize(url:, payload:)
         @url = url
         @payload = payload
-        @data_type_presets = data_type_presets
-        @data_type = data_type
       end
 
-      def select
-
+      def select(data_type)
+        pattern_options = pattern_options(data_type)
 
         selected_option = CLI.prompt_with_options(
-          "Select which preset to use for #{
+          "Select which preset to use for #{data_type}:",
           pattern_options.keys
         )
 
@@ -27,50 +25,42 @@ module NewsScraper
         end
         return if selected_option == 'skip'
 
-
-
-
+        selected_preset_code = pattern_options[selected_option]
+        result = transform_results[data_type][selected_preset_code].merge(
+          'variable' => [selected_preset_code, data_type].join('_')
+        )
+        result.delete('data')
+        result
       end
 
       private
 
-      def pattern_options
-
-
-
-
-
-
-
-      end
-        %w(xpath css).each do |pattern_provider|
-          temp_options["#{PROVIDER_PHRASE} #{pattern_provider}"] = pattern_provider
+      def pattern_options(data_type)
+        # Add valid options from the transformed results
+        options = transform_results[data_type].each_with_object({}) do |(option, details), valid_options|
+          next unless details['data'] && !details['data'].empty?
+          table_key = Terminal::Table.new do |t|
+            t << ['method', details['method']]
+            t << ['pattern', details['pattern']]
+            t << ['data', details['data']]
          end
-
+          valid_options["\n#{table_key}"] = option
        end
-        end
 
-
-
-
-
-        @results ||= @data_type_presets.each_with_object({}) do |(preset_name, preset_details), hash|
-          scrape_details[@data_type] = preset_details
-          train_transformer = Transformers::TrainerArticle.new(
-            url: @url,
-            payload: @payload,
-            scrape_details: scrape_details,
-          )
+        # Add in options to customize the pattern
+        %w(xpath css).each do |pattern_provider|
+          options["#{PROVIDER_PHRASE} #{pattern_provider}"] = pattern_provider
+        end
 
-
-
-        end.to_a
+        # Add option in to skip
+        options.merge('skip' => 'skip')
      end
 
-      def
-      @
-
-
+      def transform_results
+        @transform_results ||= Transformers::TrainerArticle.new(
+          url: @url,
+          payload: @payload
+        ).transform
      end
    end
  end
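
Not part of the diff: each key in `pattern_options` is a rendered `Terminal::Table` (hence the new runtime dependency), so the prompt shows method, pattern, and scraped data side by side. A standalone sketch of the table built per preset, with example values:

```
require 'terminal-table'

table = Terminal::Table.new do |t|
  t << ['method', 'xpath']
  t << ['pattern', "//meta[@name='section']/@content"]
  t << ['data', 'technology']
end
puts table
```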
data/lib/news_scraper/trainer/url_trainer.rb
CHANGED
@@ -8,18 +8,17 @@ module NewsScraper
      end
 
      def train
-        return if
+        return if NewsScraper.configuration.scrape_patterns['domains'].key?(@root_domain)
 
        CLI.put_header(@root_domain)
-        CLI.log("There is no scrape pattern defined for #{@root_domain}
+        CLI.log("There is no scrape pattern defined for #{@root_domain}")
        CLI.log "Fetching information..."
        CLI.put_footer
 
        selected_presets = {}
-
+        NewsScraper.configuration.scrape_patterns['data_types'].each do |data_type|
          selected_presets[data_type] = selected_pattern(data_type)
        end
-
        save_selected_presets(selected_presets)
      end
 
@@ -27,29 +26,29 @@ module NewsScraper
 
      def selected_pattern(data_type)
        CLI.put_header("Determining information for #{data_type}")
-
-        pattern = if data_type_presets.nil?
+        pattern = if NewsScraper.configuration.scrape_patterns['presets'][data_type].nil?
          CLI.log("No presets were found for #{data_type}. Skipping to next.")
          nil
        else
-
-          url: @url,
-          payload: @payload,
-          data_type_presets: data_type_presets,
-          data_type: data_type
-        ).select
+          preset_selector.select(data_type)
        end
        CLI.put_footer
-
        pattern || { 'method' => "<<<<< TODO >>>>>", 'pattern' => "<<<<< TODO >>>>>" }
      end
 
+      def preset_selector
+        @preset_selector ||= PresetSelector.new(url: @url, payload: @payload)
+      end
+
      def save_selected_presets(selected_presets)
-
+        return unless NewsScraper.configuration.scrape_patterns_filepath
+
+        current_content = File.read(NewsScraper.configuration.scrape_patterns_filepath).chomp
        new_content = "#{current_content}\n#{build_domain_yaml(selected_presets)}\n"
 
-        File.write(
-        CLI.log("Successfully wrote presets for #{@root_domain} to
+        File.write(NewsScraper.configuration.scrape_patterns_filepath, new_content)
+        CLI.log("Successfully wrote presets for #{@root_domain} to"\
+                " #{NewsScraper.configuration.scrape_patterns_filepath}.")
      end
 
      def build_domain_yaml(selected_presets)
@@ -65,10 +64,6 @@ module NewsScraper
        end
        output_string.join("\n")
      end
-
-      def article_scrape_patterns
-        @article_scrape_patterns ||= YAML.load_file(Constants::SCRAPE_PATTERN_FILEPATH)
-      end
    end
  end
 end
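
Not part of the diff: training a single domain uses the entry point named in the `Scraper` docs above; per the `Configuration` comments, setting `scrape_patterns_filepath` to `nil` skips the file write. A sketch:

```
NewsScraper.configuration.scrape_patterns_filepath = nil # train without saving
NewsScraper::Trainer::UrlTrainer.new('https://linusworld.com/the-linux-kernel').train
```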
data/lib/news_scraper/transformers/article.rb
CHANGED
@@ -2,6 +2,7 @@ require 'nokogiri'
 require 'sanitize'
 require 'readability'
 require 'htmlbeautifier'
+require 'news_scraper/transformers/nokogiri/functions'
 
 module NewsScraper
   module Transformers
@@ -13,9 +14,8 @@ module NewsScraper
     # - <code>payload</code>: keyword arg - the result of the scrape
     #
     def initialize(url:, payload:)
-
-      @
-      @root_domain = uri_parser.host
+      @url = url
+      @root_domain = URIParser.new(url).host
       @payload = payload
     end
 
@@ -28,37 +28,36 @@ module NewsScraper
     # - <code>transformed_response</code>: the response that has been parsed and transformed to a hash
     #
     def transform
-
-
-      transformed_response.merge(
+      scrape_details = NewsScraper.configuration.scrape_patterns['domains'][@root_domain]
+      raise ScrapePatternNotDefined.new(url: @url, root_domain: @root_domain) unless scrape_details
+      transformed_response(scrape_details).merge(url: @url, root_domain: @root_domain)
     end
 
     private
 
-    def scrape_details
-
-
+    def transformed_response(scrape_details)
+      NewsScraper.configuration.scrape_patterns['data_types'].each_with_object({}) do |data_type, response|
+        response[data_type.to_sym] = nil
+        next unless scrape_details[data_type]
 
-
-
-
+        response[data_type.to_sym] = parsed_data(
+          scrape_details[data_type]['method'].to_sym,
+          scrape_details[data_type]['pattern']
+        )
      end
    end
 
-    def parsed_data(
-      return nil unless scrape_details[data_type]
-
-      scrape_method = scrape_details[data_type]['method'].to_sym
+    def parsed_data(scrape_method, scrape_pattern)
      case scrape_method
      when :xpath
-        noko_html = Nokogiri::HTML(@payload)
+        noko_html = ::Nokogiri::HTML(@payload)
        Sanitize.fragment(
-          noko_html.
+          noko_html.xpath("(#{scrape_pattern})[1]", Nokogiri::Functions.new)
        ).squish
      when :css
-        noko_html = Nokogiri::HTML(@payload)
+        noko_html = ::Nokogiri::HTML(@payload)
        Sanitize.fragment(
-          noko_html.
+          noko_html.css(scrape_pattern)
        ).squish
      when :readability
        content = Readability::Document.new(
data/lib/news_scraper/transformers/nokogiri/functions.rb
ADDED
@@ -0,0 +1,15 @@
+require 'nokogiri'
+
+module NewsScraper
+  module Transformers
+    module Nokogiri
+      class Functions
+        # Implements fn:string-join of XPath 2.0
+        def string_join(nodeset, separator)
+          nodeset.map(&:text).join(separator)
+        end
+        alias_method :'string-join', :string_join
+      end
+    end
+  end
+end
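
Not part of the diff: `Article#parsed_data` above passes `Nokogiri::Functions.new` as the XPath handler, which is how Nokogiri dispatches unknown functions such as `string-join` to Ruby. A standalone sketch:

```
require 'nokogiri'
require 'news_scraper/transformers/nokogiri/functions'

doc = Nokogiri::HTML('<div><p>a</p><p>b</p></div>')
joined = doc.xpath(
  "string-join(//p, ', ')",
  NewsScraper::Transformers::Nokogiri::Functions.new
)
puts joined # => "a, b"
```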
data/lib/news_scraper/transformers/trainer_article.rb
CHANGED
@@ -6,12 +6,26 @@ module NewsScraper
     # *Params*
     # - <code>url</code>: keyword arg - the url on which scraping was done
     # - <code>payload</code>: keyword arg - the result of the scrape
-    # - <code>scrape_details</code>: keyword arg - The pattern/methods for the domain to use in the transformation
     #
-    def initialize(url:, payload
-      @scrape_details = scrape_details
+    def initialize(url:, payload:)
       super(url: url, payload: payload)
     end
+
+    # Transform the article
+    #
+    # *Returns*
+    # - <code>transformed_response</code>: tries all possible presets and returns a hash representing the results
+    #
+    def transform
+      presets = NewsScraper.configuration.scrape_patterns['presets']
+      transformed_response = presets.each_with_object({}) do |(data_type, preset_options), response|
+        response[data_type] = preset_options.each_with_object({}) do |(option, scrape_details), data_type_options|
+          data = parsed_data(scrape_details['method'].to_sym, scrape_details['pattern'])
+          data_type_options[option] = scrape_details.merge('data' => data)
+        end
+      end
+      transformed_response.merge('url' => @url, 'root_domain' => @root_domain)
+    end
   end
  end
 end
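
Not part of the diff: `TrainerArticle#transform` runs every preset for every data type and tags each result with its scraped `'data'`, which `PresetSelector` then filters on. An illustrative (not literal) slice of its output:

```
{
  'section' => {
    'meta'    => { 'method' => 'xpath', 'pattern' => "//meta[@property='article:section']/@content", 'data' => 'technology' },
    'section' => { 'method' => 'xpath', 'pattern' => "//meta[@name='section']/@content", 'data' => '' }
  },
  'url' => 'https://linusworld.com/the-linux-kernel',
  'root_domain' => 'linusworld.com'
}
```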
data/lib/news_scraper/version.rb
CHANGED
data/lib/news_scraper.rb
CHANGED
@@ -1,7 +1,8 @@
 require 'httparty'
 require 'yaml'
+require 'terminal-table'
 
-require 'news_scraper/
+require 'news_scraper/configuration'
 require 'news_scraper/uri_parser'
 require 'news_scraper/active_support_lite/string'
 
@@ -23,11 +24,12 @@ require 'news_scraper/trainer'
 
 module NewsScraper
   extend self
+  attr_writer :configuration
 
   # <code>NewsScraper::train</code> is an interactive command-line prompt that:
   #
   # 1. Collates all articles for the given :query
-  # 2. Grep for <code>:data_types</code> using <code>:presets</code> in <code>
+  # 2. Grep for <code>:data_types</code> using <code>:presets</code> in the config set in the <code>configuration</code>
   # 3. Displays the results of each <code>:preset</code> grep for a given <code>:data_type</code>
   # 4. Prompts to select one of the <code>:presets</code> or define a pattern for that domain's <code>:data_type</code>
   # N.B: User may ignore all presets and manually configure it in the YAML file
@@ -36,7 +38,21 @@ module NewsScraper
   # *Params*
   # - <code>query</code>: a keyword arugment specifying the query to train on
   #
+  # :nocov:
   def train(query:)
     Trainer.train(query: query)
   end
+  # :nocov:
+
+  def configuration
+    @configuration ||= Configuration.new
+  end
+
+  def reset_configuration
+    @configuration = Configuration.new
+  end
+
+  def configure
+    yield(configuration)
+  end
 end
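
Not part of the diff: the module now exposes the standard configure-block pattern, with `reset_configuration` restoring defaults. A sketch:

```
NewsScraper.configure do |config|
  config.scrape_patterns_filepath = nil # disable saving during training
end

NewsScraper.reset_configuration # back to DEFAULT_SCRAPE_PATTERNS_FILEPATH
```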
data/news_scraper.gemspec
CHANGED
@@ -29,6 +29,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency 'sanitize', '~> 4.2', '>= 4.2.0'
   spec.add_dependency 'ruby-readability', '~> 0.7', '>= 0.7.0'
   spec.add_dependency 'htmlbeautifier', '~> 1.1', '>= 1.1.1'
+  spec.add_dependency 'terminal-table', '~> 1.5', '>= 1.5.2'
 
   spec.add_development_dependency 'bundler', '~> 1.12', '>= 1.12.0'
   spec.add_development_dependency 'rake', '~> 10.0', '>= 10.0.0'
@@ -38,4 +39,5 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency 'timecop', '~> 0.8', '>= 0.8.0'
   spec.add_development_dependency 'rubocop', '~> 0.42', '>= 0.42.0'
   spec.add_development_dependency 'rdoc', '~> 4.2', '>= 4.2.2'
+  spec.add_development_dependency 'simplecov', '~> 0.12.0'
 end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: news_scraper
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 1.0.0
 platform: ruby
 authors:
 - Richard Wu
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-09-
+date: 2016-09-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -111,6 +111,26 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: 1.1.1
+- !ruby/object:Gem::Dependency
+  name: terminal-table
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.5'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.5.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.5'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.5.2
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -271,6 +291,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: 4.2.2
+- !ruby/object:Gem::Dependency
+  name: simplecov
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.12.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.12.0
 description: A collection of extractors, transformers and loaders for scraping news
   websites and syndicates.
 email:
@@ -296,7 +330,7 @@ files:
 - lib/news_scraper.rb
 - lib/news_scraper/active_support_lite/string.rb
 - lib/news_scraper/cli.rb
-- lib/news_scraper/
+- lib/news_scraper/configuration.rb
 - lib/news_scraper/errors.rb
 - lib/news_scraper/extractors/article.rb
 - lib/news_scraper/extractors/google_news_rss.rb
@@ -306,6 +340,7 @@ files:
 - lib/news_scraper/trainer/preset_selector.rb
 - lib/news_scraper/trainer/url_trainer.rb
 - lib/news_scraper/transformers/article.rb
+- lib/news_scraper/transformers/nokogiri/functions.rb
 - lib/news_scraper/transformers/trainer_article.rb
 - lib/news_scraper/uri_parser.rb
 - lib/news_scraper/version.rb