news_scraper 0.1.2 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: e571db5d64bc7679b4b061981208026da6878101
- data.tar.gz: 4009ca8b1571847afc1756eb5678e634feb0ba54
+ metadata.gz: db7d631f3f6cf73ff2e57b9e472804651b9fe1e0
+ data.tar.gz: 1045878eb97749d6b264a486ac34bfb89f4796dd
  SHA512:
- metadata.gz: fcb8a793e259163dec3bcdf260bdd4accb3275f11f1ae5df05ce0b40ff3708bf9113be3c8021846ed4f0497e24f5bddffc4165408fd3ef4e57c881f5a6d8d501
- data.tar.gz: be27fea65bc420052cdae19f7ae38135565fed7ff8730e5bdf4445058af34a30bbfed006c5cb5f901b6688020d892c70a322adaa5d6d4c5270521746e12d6fa6
+ metadata.gz: a53423be5dbda33ead7dbb46bc494e40fcf412172a291496128b40512c985fc157c646481e1b8a183be2f709486e2cc7ec27d47a17bb9715c3b19dbe09dd7e42
+ data.tar.gz: eb43f129a0ca1a9f6eb02f24bfeb891583c96b0cce548afe43cf613231e8908a66e381347aeb16f5be9ea4af3ca96f82dcaf4da005951fb5ae5936d3f3530cbc
data/.gitignore CHANGED
@@ -8,3 +8,4 @@
  /spec/reports/
  /tmp/
  *.gem
+ coverage
data/.rubocop.yml CHANGED
@@ -1,5 +1,5 @@
  AllCops:
- TargetRubyVersion: 2.3
+ TargetRubyVersion: 2.2

  ClassLength:
  Max: 500
data/README.md CHANGED
@@ -50,7 +50,7 @@ How the `Scraper` extracts and parses for the information is determined by scrap

  Calling `NewsScraper::Scraper#scrape` with either the array or block notation will yield `transformed_data` hashes. [`article_scrape_patterns.yml`](https://github.com/richardwu/news_scraper/blob/master/config/article_scrape_patterns.yml) defines the data types that will be scraped for.

- In addition, the `uri` and `root_domain`(hostname) of the article will be returned in the hash too.
+ In addition, the `url` and `root_domain`(hostname) of the article will be returned in the hash too.

  Example
  ```
@@ -62,7 +62,7 @@ Example
  section: 'technology',
  datetime: '1991-10-05T12:00:00+00:00',
  title: 'Linus Linux',
- uri: 'linusworld.com/the-linux-kernel',
+ url: 'https://linusworld.com/the-linux-kernel',
  root_domain: 'linusworld.com'
  }
  ```
@@ -52,10 +52,16 @@ presets:
  article_tag: &article_tag_keywords
  method: "xpath"
  pattern: "//meta[@property='article:tag']/@content"
+ news_keywords: &news_keywords_keywords
+ method: "xpath"
+ pattern: "//meta[@name='news_keywords']/@content"
  section:
  meta: &meta_section
  method: "xpath"
  pattern: "//meta[@property='article:section']/@content"
+ section: &section_section
+ method: "xpath"
+ pattern: "//meta[@name='section']/@content"
  datetime:
  article_date_original: &article_date_original_datetime
  method: xpath
@@ -87,6 +93,15 @@ presets:
  sailthru_date: &sailthru_date_datetime
  method: xpath
  pattern: //meta[@name='sailthru.date']/@content
+ time: &time_datetime
+ method: xpath
+ pattern: //time/@datetime
+ date_published_datetime: &date_published_datetime_datetime
+ method: xpath
+ pattern: //meta[@itemprop="datePublished"]/@datetime
+ date_published_content: &date_published_content_datetime
+ method: xpath
+ pattern: //meta[@itemprop="datePublished"]/@content
  title:
  html: &html_title
  method: "xpath"
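
The new presets above appear to extend the `config/article_scrape_patterns.yml` file the README links to, adding XPath patterns for `news_keywords`/`section` meta tags and for `<time datetime>` and `itemprop="datePublished"` dates. As a rough illustration only (the markup below is hypothetical, not a gem fixture), two of the new patterns evaluate with Nokogiri like this:

```ruby
require 'nokogiri'

# Hypothetical article markup containing the tags the new presets target.
html = '<html><head>' \
       '<meta name="news_keywords" content="linux, kernel">' \
       '</head><body>' \
       '<time datetime="1991-10-05T12:00:00+00:00">Oct 5, 1991</time>' \
       '</body></html>'

doc = Nokogiri::HTML(html)
doc.xpath("//meta[@name='news_keywords']/@content").text # => "linux, kernel"
doc.xpath('//time/@datetime').text                       # => "1991-10-05T12:00:00+00:00"
```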
@@ -6,14 +6,13 @@ module NewsScraper

  DEFAULT_COLOR = "\x1b[36m".freeze

- def log(message, color: DEFAULT_COLOR, new_line: false)
- message += "\n" if new_line
+ def log(message, color: DEFAULT_COLOR)
  $stdout.puts "#{color}┃\x1b[0m " + message
  end

- def log_lines(message, color: DEFAULT_COLOR, new_line: false)
+ def log_lines(message, color: DEFAULT_COLOR)
  message.split("\n").each do |line|
- log(line, color: color, new_line: new_line)
+ log(line, color: color)
  end
  end

@@ -49,8 +48,8 @@ module NewsScraper
  buf = -1
  available = (1..options.length).to_a
  until available.include?(buf.to_i)
- begin
- buf = Readline.readline("\x1b[34m┃ > \x1b[33m", true)
+ buf = begin
+ Readline.readline("\x1b[34m┃ > \x1b[33m", true)
  rescue Interrupt
  nil
  end
@@ -71,14 +70,16 @@ module NewsScraper

  ## Fancy Headers and Footers

- def put_header(text = "", color = DEFAULT_COLOR)
+ def put_header(text = "", color: DEFAULT_COLOR)
  put_edge(color, "┏━━ ", text)
  end

- def put_footer(color = DEFAULT_COLOR)
+ def put_footer(color: DEFAULT_COLOR)
  put_edge(color, "┗", "")
  end

+ private
+
  def put_edge(color, prefix, text)
  ptext = "#{color}#{prefix}#{text}"
  textwidth = printing_width(ptext)
@@ -0,0 +1,33 @@
+ module NewsScraper
+ class Configuration
+ DEFAULT_SCRAPE_PATTERNS_FILEPATH = File.expand_path('../../../config/article_scrape_patterns.yml', __FILE__)
+ attr_accessor :fetch_method, :scrape_patterns_filepath
+
+ # <code>NewsScraper::Configuration.initialize</code> initializes the scrape_patterns_filepath
+ # and the fetch_method to the <code>DEFAULT_SCRAPE_PATTERNS_FILEPATH</code>
+ #
+ # Set the <code>scrape_patterns_filepath</code> to <code>nil</code> to disable saving during training
+ #
+ def initialize
+ self.scrape_patterns_filepath = DEFAULT_SCRAPE_PATTERNS_FILEPATH
+ self.fetch_method = proc { default_scrape_patterns }
+ end
+
+ # <code>NewsScraper::Configuration.scrape_patterns</code> proxies scrape_patterns
+ # requests to <code>fetch_method</code>:
+ #
+ # *Returns*
+ # - The result of calling the <code>fetch_method</code> proc, expected to be a hash
+ #
+ def scrape_patterns
+ fetch_method.call
+ end
+
+ private
+
+ def default_scrape_patterns
+ @default_scrape_patterns ||= {}
+ @default_scrape_patterns[scrape_patterns_filepath] ||= YAML.load_file(scrape_patterns_filepath)
+ end
+ end
+ end
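
This new file (it appears to be `lib/news_scraper/configuration.rb`, which the metadata file list below adds) replaces the old `Constants` module with a configurable object. A minimal usage sketch, assuming the `NewsScraper.configure` block introduced later in this diff in `lib/news_scraper.rb`; the file paths are hypothetical:

```ruby
require 'news_scraper'

# Point the gem at a custom scrape-pattern YAML instead of the bundled default.
NewsScraper.configure do |config|
  config.scrape_patterns_filepath = '/path/to/my_scrape_patterns.yml'
end

# Or bypass the file entirely: fetch_method is any callable returning the patterns hash.
NewsScraper.configure do |config|
  config.fetch_method = proc { YAML.load_file('/path/to/my_scrape_patterns.yml') }
end

# Per the doc comment above, a nil filepath disables saving presets during training.
NewsScraper.configuration.scrape_patterns_filepath = nil

# Expected to return a hash with 'data_types', 'presets' and 'domains' keys.
NewsScraper.configuration.scrape_patterns
```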
@@ -1,14 +1,23 @@

  module NewsScraper
- class ResponseError < StandardError; end
+ class ResponseError < StandardError
+ attr_reader :error_code, :message, :url
+
+ def initialize(opts = {})
+ @error_code = opts[:error_code]
+ @message = opts[:message]
+ @url = opts[:url]
+ super
+ end
+ end

  module Transformers
  class ScrapePatternNotDefined < StandardError
- attr_reader :root_domain, :uri
+ attr_reader :root_domain, :url

  def initialize(opts = {})
  @root_domain = opts[:root_domain]
- @uri = opts[:uri]
+ @url = opts[:url]
  super
  end
  end
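
`ResponseError` now carries structured `error_code`, `message`, and `url` readers instead of a bare message, and `ScrapePatternNotDefined` exposes `url` in place of `uri`. A sketch of inspecting the richer errors (the article URL is hypothetical):

```ruby
require 'news_scraper'

url = 'https://linusworld.com/the-linux-kernel'
begin
  payload = NewsScraper::Extractors::Article.new(url: url).extract
  NewsScraper::Transformers::Article.new(url: url, payload: payload).transform
rescue NewsScraper::ResponseError => e
  # Raised by the extractor when the HTTP response is not 200 (see the hunk below).
  warn "HTTP #{e.error_code} (#{e.message}) while fetching #{e.url}"
rescue NewsScraper::Transformers::ScrapePatternNotDefined => e
  # Raised when the article's domain has no entry under 'domains' in the patterns.
  warn "No scrape pattern for #{e.root_domain} (#{e.url})"
end
```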
@@ -10,9 +10,13 @@ module NewsScraper

  CLI.put_header(url)
  CLI.log "Beginning HTTP request for #{url}"
- response = HTTParty.get(url)
+ response = HTTParty.get(url, headers: { "User-Agent" => "news-scraper-#{NewsScraper::VERSION}" })

- raise ResponseError.new("#{response.code} - #{response.message}") unless response.code == 200
+ raise ResponseError.new(
+ error_code: response.code,
+ message: response.message,
+ url: url
+ ) unless response.code == 200

  CLI.log "#{response.code} - #{response.message}. Request successful for #{url}"
  CLI.put_footer
@@ -16,6 +16,7 @@ module NewsScraper
  #
  # *Raises*
  # - Will raise a <code>Transformers::ScrapePatternNotDefined</code> if an article is not in the root domains
+ # - Will <code>yield</code> the error if a block is given
  # - Root domains are specified by the <code>article_scrape_patterns.yml</code> file
  # - This root domain will need to be trained, it would be helpful to have a PR created to train the domain
  # - You can train the domain by running <code>NewsScraper::Trainer::UrlTrainer.new(URL_TO_TRAIN).train</code>
@@ -27,13 +28,19 @@ module NewsScraper
  article_urls = Extractors::GoogleNewsRss.new(query: @query).extract

  transformed_articles = []
+
  article_urls.each do |article_url|
  payload = Extractors::Article.new(url: article_url).extract
+ article_transformer = Transformers::Article.new(url: article_url, payload: payload)

- transformed_article = Transformers::Article.new(url: article_url, payload: payload).transform
- transformed_articles << transformed_article
-
- yield transformed_article if block_given?
+ begin
+ transformed_article = article_transformer.transform
+ transformed_articles << transformed_article
+ yield transformed_article if block_given?
+ rescue Transformers::ScrapePatternNotDefined => e
+ raise e unless block_given?
+ yield e
+ end
  end

  transformed_articles
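
With this change `Scraper#scrape` no longer aborts the whole run when one article's domain is untrained: when a block is given, the `ScrapePatternNotDefined` error is yielded to the block instead of being raised. A sketch of the block notation under the new behaviour (the query is hypothetical, and `Scraper.new(query:)` is assumed from the `@query` usage above):

```ruby
require 'news_scraper'

NewsScraper::Scraper.new(query: 'linux kernel').scrape do |result|
  if result.is_a?(NewsScraper::Transformers::ScrapePatternNotDefined)
    # Untrained domain: log it and move on (or train it via Trainer::UrlTrainer).
    warn "Skipping #{result.url}: no pattern for #{result.root_domain}"
  else
    # A transformed_data hash including :title, :datetime, :url and :root_domain.
    puts "#{result[:title]} (#{result[:url]})"
  end
end
```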
@@ -3,18 +3,16 @@ module NewsScraper
  class PresetSelector
  PROVIDER_PHRASE = 'I will provide a pattern using'.freeze

- def initialize(data_type:, data_type_presets:, url:, payload:)
+ def initialize(url:, payload:)
  @url = url
  @payload = payload
- @data_type_presets = data_type_presets
- @data_type = data_type
  end

- def select
- return unless @data_type_presets
+ def select(data_type)
+ pattern_options = pattern_options(data_type)

  selected_option = CLI.prompt_with_options(
- "Select which preset to use for #{@data_type}:",
+ "Select which preset to use for #{data_type}:",
  pattern_options.keys
  )

@@ -27,50 +25,42 @@ module NewsScraper
  end
  return if selected_option == 'skip'

- selected_index = pattern_options[selected_option]
- selected_preset_code = transform_results[selected_index].first
- @data_type_presets[selected_preset_code].merge('variable' => [selected_preset_code, @data_type].join('_'))
+ selected_preset_code = pattern_options[selected_option]
+ result = transform_results[data_type][selected_preset_code].merge(
+ 'variable' => [selected_preset_code, data_type].join('_')
+ )
+ result.delete('data')
+ result
  end

  private

- def pattern_options
- return {} unless @data_type_presets
-
- @pattern_options ||= begin
- temp_options = transform_results.each_with_object({}).with_index do |(results, options_hash), index|
- preset_name = "#{results[0]}_#{@data_type}"
- extracted_text = results[1]
- options_hash["#{preset_name}: #{extracted_text}"] = index
- end
- %w(xpath css).each do |pattern_provider|
- temp_options["#{PROVIDER_PHRASE} #{pattern_provider}"] = pattern_provider
+ def pattern_options(data_type)
+ # Add valid options from the transformed results
+ options = transform_results[data_type].each_with_object({}) do |(option, details), valid_options|
+ next unless details['data'] && !details['data'].empty?
+ table_key = Terminal::Table.new do |t|
+ t << ['method', details['method']]
+ t << ['pattern', details['pattern']]
+ t << ['data', details['data']]
  end
- temp_options.merge('skip' => 'skip')
+ valid_options["\n#{table_key}"] = option
  end
- end

- def transform_results
- return {} unless @data_type_presets
-
- scrape_details = blank_scrape_details
- @results ||= @data_type_presets.each_with_object({}) do |(preset_name, preset_details), hash|
- scrape_details[@data_type] = preset_details
- train_transformer = Transformers::TrainerArticle.new(
- url: @url,
- payload: @payload,
- scrape_details: scrape_details,
- )
+ # Add in options to customize the pattern
+ %w(xpath css).each do |pattern_provider|
+ options["#{PROVIDER_PHRASE} #{pattern_provider}"] = pattern_provider
+ end

- transformed_result = train_transformer.transform[@data_type.to_sym]
- hash[preset_name] = transformed_result if transformed_result && !transformed_result.empty?
- end.to_a
+ # Add option in to skip
+ options.merge('skip' => 'skip')
  end

- def blank_scrape_details
- @blank_scrape_details ||= Constants::SCRAPE_PATTERNS.each_with_object({}) do |data_type, hash|
- hash[data_type] = nil
- end
+ def transform_results
+ @transform_results ||= Transformers::TrainerArticle.new(
+ url: @url,
+ payload: @payload
+ ).transform
  end
  end
  end
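
`PresetSelector` now renders each candidate preset's method, pattern, and extracted data as a `terminal-table` (the new runtime dependency added in the gemspec below), so the CLI prompt keys are small tables rather than one-line strings. A minimal sketch of that rendering; the row values are hypothetical:

```ruby
require 'terminal-table'

# Roughly what pattern_options builds as the prompt key for a preset that returned data.
table = Terminal::Table.new do |t|
  t << ['method',  'xpath']
  t << ['pattern', "//meta[@property='article:section']/@content"]
  t << ['data',    'technology']
end

puts table # renders a bordered ASCII table with one row per attribute
```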
@@ -8,18 +8,17 @@ module NewsScraper
  end

  def train
- return if article_scrape_patterns['domains'].key?(@root_domain)
+ return if NewsScraper.configuration.scrape_patterns['domains'].key?(@root_domain)

  CLI.put_header(@root_domain)
- CLI.log("There is no scrape pattern defined for #{@root_domain} in #{Constants::SCRAPE_PATTERN_FILEPATH}")
+ CLI.log("There is no scrape pattern defined for #{@root_domain}")
  CLI.log "Fetching information..."
  CLI.put_footer

  selected_presets = {}
- article_scrape_patterns['data_types'].each do |data_type|
+ NewsScraper.configuration.scrape_patterns['data_types'].each do |data_type|
  selected_presets[data_type] = selected_pattern(data_type)
  end
-
  save_selected_presets(selected_presets)
  end

@@ -27,29 +26,29 @@ module NewsScraper

  def selected_pattern(data_type)
  CLI.put_header("Determining information for #{data_type}")
- data_type_presets = article_scrape_patterns['presets'][data_type]
- pattern = if data_type_presets.nil?
+ pattern = if NewsScraper.configuration.scrape_patterns['presets'][data_type].nil?
  CLI.log("No presets were found for #{data_type}. Skipping to next.")
  nil
  else
- PresetSelector.new(
- url: @url,
- payload: @payload,
- data_type_presets: data_type_presets,
- data_type: data_type
- ).select
+ preset_selector.select(data_type)
  end
  CLI.put_footer
-
  pattern || { 'method' => "<<<<< TODO >>>>>", 'pattern' => "<<<<< TODO >>>>>" }
  end

+ def preset_selector
+ @preset_selector ||= PresetSelector.new(url: @url, payload: @payload)
+ end
+
  def save_selected_presets(selected_presets)
- current_content = File.read(Constants::SCRAPE_PATTERN_FILEPATH).chomp
+ return unless NewsScraper.configuration.scrape_patterns_filepath
+
+ current_content = File.read(NewsScraper.configuration.scrape_patterns_filepath).chomp
  new_content = "#{current_content}\n#{build_domain_yaml(selected_presets)}\n"

- File.write(Constants::SCRAPE_PATTERN_FILEPATH, new_content)
- CLI.log("Successfully wrote presets for #{@root_domain} to #{Constants::SCRAPE_PATTERN_FILEPATH}.")
+ File.write(NewsScraper.configuration.scrape_patterns_filepath, new_content)
+ CLI.log("Successfully wrote presets for #{@root_domain} to"\
+ " #{NewsScraper.configuration.scrape_patterns_filepath}.")
  end

  def build_domain_yaml(selected_presets)
@@ -65,10 +64,6 @@ module NewsScraper
  end
  output_string.join("\n")
  end
-
- def article_scrape_patterns
- @article_scrape_patterns ||= YAML.load_file(Constants::SCRAPE_PATTERN_FILEPATH)
- end
  end
  end
  end
@@ -2,6 +2,7 @@ require 'nokogiri'
  require 'sanitize'
  require 'readability'
  require 'htmlbeautifier'
+ require 'news_scraper/transformers/nokogiri/functions'

  module NewsScraper
  module Transformers
@@ -13,9 +14,8 @@ module NewsScraper
  # - <code>payload</code>: keyword arg - the result of the scrape
  #
  def initialize(url:, payload:)
- uri_parser = URIParser.new(url)
- @uri = uri_parser.without_scheme
- @root_domain = uri_parser.host
+ @url = url
+ @root_domain = URIParser.new(url).host
  @payload = payload
  end

@@ -28,37 +28,36 @@ module NewsScraper
  # - <code>transformed_response</code>: the response that has been parsed and transformed to a hash
  #
  def transform
- raise ScrapePatternNotDefined.new(uri: @uri, root_domain: @root_domain) unless scrape_details
-
- transformed_response.merge(uri: @uri, root_domain: @root_domain)
+ scrape_details = NewsScraper.configuration.scrape_patterns['domains'][@root_domain]
+ raise ScrapePatternNotDefined.new(url: @url, root_domain: @root_domain) unless scrape_details
+ transformed_response(scrape_details).merge(url: @url, root_domain: @root_domain)
  end

  private

- def scrape_details
- @scrape_details ||= Constants::SCRAPE_PATTERNS['domains'][@root_domain]
- end
+ def transformed_response(scrape_details)
+ NewsScraper.configuration.scrape_patterns['data_types'].each_with_object({}) do |data_type, response|
+ response[data_type.to_sym] = nil
+ next unless scrape_details[data_type]

- def transformed_response
- Constants::SCRAPE_PATTERNS['data_types'].each_with_object({}) do |data_type, response|
- response[data_type.to_sym] = parsed_data(data_type)
+ response[data_type.to_sym] = parsed_data(
+ scrape_details[data_type]['method'].to_sym,
+ scrape_details[data_type]['pattern']
+ )
  end
  end

- def parsed_data(data_type)
- return nil unless scrape_details[data_type]
-
- scrape_method = scrape_details[data_type]['method'].to_sym
+ def parsed_data(scrape_method, scrape_pattern)
  case scrape_method
  when :xpath
- noko_html = Nokogiri::HTML(@payload)
+ noko_html = ::Nokogiri::HTML(@payload)
  Sanitize.fragment(
- noko_html.send(scrape_method, "(#{scrape_details[data_type]['pattern']})[1]")
+ noko_html.xpath("(#{scrape_pattern})[1]", Nokogiri::Functions.new)
  ).squish
  when :css
- noko_html = Nokogiri::HTML(@payload)
+ noko_html = ::Nokogiri::HTML(@payload)
  Sanitize.fragment(
- noko_html.send(scrape_method, scrape_details[data_type]['pattern'])
+ noko_html.css(scrape_pattern)
  ).squish
  when :readability
  content = Readability::Document.new(
@@ -0,0 +1,15 @@
+ require 'nokogiri'
+
+ module NewsScraper
+ module Transformers
+ module Nokogiri
+ class Functions
+ # Implements fn:string-join of XPath 2.0
+ def string_join(nodeset, separator)
+ nodeset.map(&:text).join(separator)
+ end
+ alias_method :'string-join', :string_join
+ end
+ end
+ end
+ end
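
This new handler (it appears to be `lib/news_scraper/transformers/nokogiri/functions.rb`, added in the file list below) supplies an XPath 2.0-style `string-join` function, and `Transformers::Article#parsed_data` above now passes an instance of it as the second argument to Nokogiri's `xpath`. A standalone sketch; the markup and pattern are hypothetical:

```ruby
require 'nokogiri'
require 'news_scraper'

doc = Nokogiri::HTML('<p>Alan Turing</p><p>Ada Lovelace</p>')

# string-join(nodeset, separator) joins the text of every matched node, which is
# useful for scrape patterns that target multi-element fields such as author lists.
doc.xpath(
  "string-join(//p, ', ')",
  NewsScraper::Transformers::Nokogiri::Functions.new
)
# => "Alan Turing, Ada Lovelace"
```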
@@ -6,12 +6,26 @@ module NewsScraper
  # *Params*
  # - <code>url</code>: keyword arg - the url on which scraping was done
  # - <code>payload</code>: keyword arg - the result of the scrape
- # - <code>scrape_details</code>: keyword arg - The pattern/methods for the domain to use in the transformation
  #
- def initialize(url:, payload:, scrape_details:)
- @scrape_details = scrape_details
+ def initialize(url:, payload:)
  super(url: url, payload: payload)
  end
+
+ # Transform the article
+ #
+ # *Returns*
+ # - <code>transformed_response</code>: tries all possible presets and returns a hash representing the results
+ #
+ def transform
+ presets = NewsScraper.configuration.scrape_patterns['presets']
+ transformed_response = presets.each_with_object({}) do |(data_type, preset_options), response|
+ response[data_type] = preset_options.each_with_object({}) do |(option, scrape_details), data_type_options|
+ data = parsed_data(scrape_details['method'].to_sym, scrape_details['pattern'])
+ data_type_options[option] = scrape_details.merge('data' => data)
+ end
+ end
+ transformed_response.merge('url' => @url, 'root_domain' => @root_domain)
+ end
  end
  end
  end
@@ -1,3 +1,3 @@
  module NewsScraper
- VERSION = "0.1.2".freeze
+ VERSION = "1.0.0".freeze
  end
data/lib/news_scraper.rb CHANGED
@@ -1,7 +1,8 @@
  require 'httparty'
  require 'yaml'
+ require 'terminal-table'

- require 'news_scraper/constants'
+ require 'news_scraper/configuration'
  require 'news_scraper/uri_parser'
  require 'news_scraper/active_support_lite/string'

@@ -23,11 +24,12 @@ require 'news_scraper/trainer'

  module NewsScraper
  extend self
+ attr_writer :configuration

  # <code>NewsScraper::train</code> is an interactive command-line prompt that:
  #
  # 1. Collates all articles for the given :query
- # 2. Grep for <code>:data_types</code> using <code>:presets</code> in <code>config/article_scrape_patterns.yml</code>
+ # 2. Grep for <code>:data_types</code> using <code>:presets</code> in the config set in the <code>configuration</code>
  # 3. Displays the results of each <code>:preset</code> grep for a given <code>:data_type</code>
  # 4. Prompts to select one of the <code>:presets</code> or define a pattern for that domain's <code>:data_type</code>
  # N.B: User may ignore all presets and manually configure it in the YAML file
@@ -36,7 +38,21 @@ module NewsScraper
  # *Params*
  # - <code>query</code>: a keyword arugment specifying the query to train on
  #
+ # :nocov:
  def train(query:)
  Trainer.train(query: query)
  end
+ # :nocov:
+
+ def configuration
+ @configuration ||= Configuration.new
+ end
+
+ def reset_configuration
+ @configuration = Configuration.new
+ end
+
+ def configure
+ yield(configuration)
+ end
  end
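
The top-level module now owns configuration: `configuration` memoizes a `Configuration`, `configure` yields it, `reset_configuration` rebuilds the default, and `attr_writer :configuration` allows assigning one directly. A short sketch of that lifecycle (the fixture path is hypothetical), e.g. swapping patterns in and out around a test:

```ruby
require 'news_scraper'

NewsScraper.configure { |c| c.scrape_patterns_filepath = 'spec/fixtures/patterns.yml' }
NewsScraper.configuration.scrape_patterns_filepath # => "spec/fixtures/patterns.yml"

NewsScraper.reset_configuration # back to DEFAULT_SCRAPE_PATTERNS_FILEPATH

# The writer also accepts a prebuilt Configuration object.
NewsScraper.configuration = NewsScraper::Configuration.new
```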
data/news_scraper.gemspec CHANGED
@@ -29,6 +29,7 @@ Gem::Specification.new do |spec|
  spec.add_dependency 'sanitize', '~> 4.2', '>= 4.2.0'
  spec.add_dependency 'ruby-readability', '~> 0.7', '>= 0.7.0'
  spec.add_dependency 'htmlbeautifier', '~> 1.1', '>= 1.1.1'
+ spec.add_dependency 'terminal-table', '~> 1.5', '>= 1.5.2'

  spec.add_development_dependency 'bundler', '~> 1.12', '>= 1.12.0'
  spec.add_development_dependency 'rake', '~> 10.0', '>= 10.0.0'
@@ -38,4 +39,5 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency 'timecop', '~> 0.8', '>= 0.8.0'
  spec.add_development_dependency 'rubocop', '~> 0.42', '>= 0.42.0'
  spec.add_development_dependency 'rdoc', '~> 4.2', '>= 4.2.2'
+ spec.add_development_dependency 'simplecov', '~> 0.12.0'
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: news_scraper
  version: !ruby/object:Gem::Version
- version: 0.1.2
+ version: 1.0.0
  platform: ruby
  authors:
  - Richard Wu
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2016-09-19 00:00:00.000000000 Z
+ date: 2016-09-25 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: nokogiri
@@ -111,6 +111,26 @@ dependencies:
  - - ">="
  - !ruby/object:Gem::Version
  version: 1.1.1
+ - !ruby/object:Gem::Dependency
+ name: terminal-table
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '1.5'
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: 1.5.2
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '1.5'
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: 1.5.2
  - !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
@@ -271,6 +291,20 @@ dependencies:
  - - ">="
  - !ruby/object:Gem::Version
  version: 4.2.2
+ - !ruby/object:Gem::Dependency
+ name: simplecov
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: 0.12.0
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: 0.12.0
  description: A collection of extractors, transformers and loaders for scraping news
  websites and syndicates.
  email:
@@ -296,7 +330,7 @@ files:
  - lib/news_scraper.rb
  - lib/news_scraper/active_support_lite/string.rb
  - lib/news_scraper/cli.rb
- - lib/news_scraper/constants.rb
+ - lib/news_scraper/configuration.rb
  - lib/news_scraper/errors.rb
  - lib/news_scraper/extractors/article.rb
  - lib/news_scraper/extractors/google_news_rss.rb
@@ -306,6 +340,7 @@ files:
  - lib/news_scraper/trainer/preset_selector.rb
  - lib/news_scraper/trainer/url_trainer.rb
  - lib/news_scraper/transformers/article.rb
+ - lib/news_scraper/transformers/nokogiri/functions.rb
  - lib/news_scraper/transformers/trainer_article.rb
  - lib/news_scraper/uri_parser.rb
  - lib/news_scraper/version.rb
@@ -1,6 +0,0 @@
- module NewsScraper
- module Constants
- SCRAPE_PATTERN_FILEPATH = File.expand_path('../../../config/article_scrape_patterns.yml', __FILE__)
- SCRAPE_PATTERNS = YAML.load_file(SCRAPE_PATTERN_FILEPATH)
- end
- end