news_scraper 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: cadeecdf95b6d5fd907671773fc3cc521fe756a2
4
+ data.tar.gz: 150d667251c00f96b01b195c99fe2a44f3da8da9
5
+ SHA512:
6
+ metadata.gz: 8af8f251dce23589d5e08af5f9c0510d031b91ed56188a0e6b3418b26397ad9ba98835a516c8e174d8aed3ea791793e5ceec8c3422a6e7731ac84196b49f287b
7
+ data.tar.gz: d55a513397be97f08b0adcbc975db4ad1c39d0887576979b138a50b5cb3601eb03036a9cfe75d1b0d4dcd2ab0303256c9112c865749905e5540f35bdefef18eb
data/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.gem
data/.rubocop.yml ADDED
@@ -0,0 +1,96 @@
1
+ AllCops:
2
+ TargetRubyVersion: 2.3
3
+
4
+ ClassLength:
5
+ Max: 500
6
+
7
+ ModuleLength:
8
+ Max: 500
9
+
10
+ Rails:
11
+ Enabled: false
12
+
13
+ Lint/AssignmentInCondition:
14
+ Enabled: false
15
+
16
+ Style/Documentation:
17
+ Enabled: false
18
+
19
+ Style/MultilineOperationIndentation:
20
+ Enabled: true
21
+
22
+ Style/AlignParameters:
23
+ EnforcedStyle: with_fixed_indentation
24
+
25
+ Style/FirstParameterIndentation:
26
+ EnforcedStyle: consistent
27
+
28
+ Style/TrailingCommaInLiteral:
29
+ Enabled: false
30
+
31
+ Style/TrailingCommaInArguments:
32
+ Enabled: false
33
+
34
+ Style/SignalException:
35
+ EnforcedStyle: only_raise
36
+
37
+ Style/NumericLiterals:
38
+ Enabled: true
39
+
40
+ Style/CaseIndentation:
41
+ IndentWhenRelativeTo: end
42
+
43
+ Style/IndentHash:
44
+ EnforcedStyle: consistent
45
+
46
+ Style/WordArray:
47
+ Enabled: true
48
+
49
+ Style/ModuleFunction:
50
+ Enabled: false
51
+
52
+ Style/RaiseArgs:
53
+ EnforcedStyle: compact
54
+
55
+ Metrics/AbcSize:
56
+ Enabled: false
57
+
58
+ Metrics/CyclomaticComplexity:
59
+ Enabled: false
60
+
61
+ Style/StringLiterals:
62
+ Enabled: false
63
+
64
+ Metrics/LineLength:
65
+ Max: 120
66
+
67
+ Metrics/ClassLength:
68
+ Enabled: false
69
+
70
+ Metrics/MethodLength:
71
+ Enabled: false
72
+
73
+ Metrics/ParameterLists:
74
+ Max: 5
75
+ CountKeywordArgs: false
76
+
77
+ Metrics/PerceivedComplexity:
78
+ Enabled: false
79
+
80
+ Lint/EndAlignment:
81
+ AlignWith: variable
82
+
83
+ Style/FrozenStringLiteralComment:
84
+ Enabled: false
85
+
86
+ Style/Alias:
87
+ EnforcedStyle: prefer_alias_method
88
+
89
+ Style/MutableConstant:
90
+ Enabled: true
91
+
92
+ Performance/Casecmp:
93
+ Enabled: true
94
+
95
+ Style/GuardClause:
96
+ Enabled: true
@@ -0,0 +1,49 @@
1
+ # Contributor Code of Conduct
2
+
3
+ As contributors and maintainers of this project, and in the interest of
4
+ fostering an open and welcoming community, we pledge to respect all people who
5
+ contribute through reporting issues, posting feature requests, updating
6
+ documentation, submitting pull requests or patches, and other activities.
7
+
8
+ We are committed to making participation in this project a harassment-free
9
+ experience for everyone, regardless of level of experience, gender, gender
10
+ identity and expression, sexual orientation, disability, personal appearance,
11
+ body size, race, ethnicity, age, religion, or nationality.
12
+
13
+ Examples of unacceptable behavior by participants include:
14
+
15
+ * The use of sexualized language or imagery
16
+ * Personal attacks
17
+ * Trolling or insulting/derogatory comments
18
+ * Public or private harassment
19
+ * Publishing others' private information, such as physical or electronic
20
+ addresses, without explicit permission
21
+ * Other unethical or unprofessional conduct
22
+
23
+ Project maintainers have the right and responsibility to remove, edit, or
24
+ reject comments, commits, code, wiki edits, issues, and other contributions
25
+ that are not aligned to this Code of Conduct, or to ban temporarily or
26
+ permanently any contributor for other behaviors that they deem inappropriate,
27
+ threatening, offensive, or harmful.
28
+
29
+ By adopting this Code of Conduct, project maintainers commit themselves to
30
+ fairly and consistently applying these principles to every aspect of managing
31
+ this project. Project maintainers who do not follow or enforce the Code of
32
+ Conduct may be permanently removed from the project team.
33
+
34
+ This code of conduct applies both within project spaces and in public spaces
35
+ when an individual is representing the project or its community.
36
+
37
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
38
+ reported by contacting a project maintainer at richardwu1997@gmail.com. All
39
+ complaints will be reviewed and investigated and will result in a response that
40
+ is deemed necessary and appropriate to the circumstances. Maintainers are
41
+ obligated to maintain confidentiality with regard to the reporter of an
42
+ incident.
43
+
44
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
45
+ version 1.3.0, available at
46
+ [http://contributor-covenant.org/version/1/3/0/][version]
47
+
48
+ [homepage]: http://contributor-covenant.org
49
+ [version]: http://contributor-covenant.org/version/1/3/0/
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Richard Wu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,105 @@
1
+ # NewsScraper
2
+
3
+ ### Simple ETL news scraper in Ruby
4
+
5
+ [RubyGems](https://rubygems.org/gems/news_scraper)
6
+
7
+ A collection of extractors, transformers and loaders for a variety of news feeds and outlets.
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ ```ruby
14
+ gem 'news_scraper'
15
+ ```
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install news_scraper
24
+
25
+ ## Usage
26
+
27
+ ### Scraping
28
+
29
+ `NewsScraper::Scraper#scrape` will return an array of the transformed data for all Google News RSS articles for the given query.
30
+
31
+ Optionally, you can pass in a block and it will yield the transformed data on a per-article basis.
32
+
33
+ It takes in 1 parameter `query:`.
34
+
35
+ Array notation
36
+ ```
37
+ article_hashes = NewsScraper::Scraper.new(query: 'Shopify').scrape # [ { author: ... }, { author: ... } ... ]
38
+ ```
39
+
40
+ Block notation
41
+ ```
42
+ NewsScraper::Scraper.new(query: 'Shopify').scrape do |article_hash|
43
+ # { author: ... }
44
+ end
45
+ ```
46
+
47
+ How the `Scraper` extracts and parses this information is determined by scrape patterns (see **Scrape Patterns**).
48
+
49
+ ### Transformed Data
50
+
51
+ Calling `NewsScraper::Scraper#scrape` with either the array or block notation will yield `transformed_data` hashes. [`article_scrape_patterns.yml`](https://github.com/richardwu/news_scraper/blob/master/config/article_scrape_patterns.yml) defines the data types that will be scraped for.
52
+
53
+ In addition, the `uri` and `root_domain` (hostname) of the article will be returned in the hash too.
54
+
55
+ Example
56
+ ```
57
+ {
58
+ author: 'Linus Torvald',
59
+ body: 'The Linux kernel developed by Linus Torvald has become the backbone of most electronic devices we use to-date. It powers mobile phones, laptops, embedded devices, and even rockets...',
60
+ description: 'The Linux kernel is one of the most important contributions to the world of technology.',
61
+ keywords: 'linux,kernel,linus,torvald',
62
+ section: 'technology',
63
+ datetime: '1991-10-05T12:00:00+00:00',
64
+ title: 'Linus Linux',
65
+ uri: 'linusworld.com/the-linux-kernel',
66
+ root_domain: 'linusworld.com'
67
+ }
68
+ ```
69
+
70
+ ### Scrape Patterns
71
+
72
+ Scrape patterns are xpath or CSS patterns used by Nokogiri to extract relevant HTML elements.
73
+
74
+ Extracting each `:data_type` (see Example under **Transformed Data**) requires a scrape pattern. A few `:presets` are specified in [`article_scrape_patterns.yml`](https://github.com/richardwu/news_scraper/blob/master/config/article_scrape_patterns.yml).
75
+
76
+ Since each news site (identified with `:root_domain`) uses a different markup, scrape patterns are defined on a per-`:root_domain` basis.
77
+
78
+ Specifying scrape patterns for new, undefined `:root_domains` is called training (see **Training**).
79
+
80
+ ### Training
81
+
82
+ For each `:root_domain`, it is necessary to specify a scrape pattern for each of the `:data_type`s. A rake task was written to provide a CLI for appending new `:root_domain`s using `:preset` scrape patterns.
83
+
84
+ Simply run
85
+ ```
86
+ bundle exec rake scraper:train QUERY=<query>
87
+ ```
88
+
89
+ where the CLI will step through the articles and `:root_domain`s of the articles relevant to `<query>`.
90
+
91
+ ## Development
92
+
93
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
94
+
95
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
96
+
97
+ ## Contributing
98
+
99
+ Bug reports and pull requests are welcome on GitHub at https://github.com/richardwu/news_scraper. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
100
+
101
+
102
+ ## License
103
+
104
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
105
+
data/Rakefile ADDED
@@ -0,0 +1,24 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/testtask'
3
+ require 'rdoc/task'
4
+
5
+ require 'news_scraper'
6
+
7
# Defines the default `test` task: adds `test` to the load path and runs
# every file matching test/**/*_test.rb.
Rake::TestTask.new do |test_task|
  test_task.libs.push('test')
  test_task.pattern = 'test/**/*_test.rb'
end
11
+
12
namespace :scraper do
  desc 'CLI that steps through articles for a given query and displays preset scrape pattern results; parameters: QUERY'
  task :train do
    # The query is passed through the environment: `rake scraper:train QUERY=...`.
    query = ENV['QUERY']
    raise "QUERY param not given.\n\tUsage: bundle exec rake scraper:train QUERY=<query>" unless query

    NewsScraper::Trainer.train(query: query)
  end
end
19
+
20
# Defines the `rdoc` task: documents the README plus everything under lib/
# and writes the output to doc/.
RDoc::Task.new do |doc_task|
  doc_task.main = "README.md"
  doc_task.rdoc_files.include("README.md", "lib/*.rb", "lib/**/*.rb")
  doc_task.rdoc_dir = "doc"
end
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "news_scraper"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/circle.yml ADDED
@@ -0,0 +1,3 @@
1
+ test:
2
+ pre:
3
+ - bundle exec rubocop
@@ -0,0 +1,116 @@
1
+ # All domains should include the scrape method/pattern for data_types
2
+
3
+ data_types:
4
+ - "author"
5
+ - "body"
6
+ - "description"
7
+ - "keywords"
8
+ - "section"
9
+ - "datetime"
10
+ - "title"
11
+
12
+ # All data types must include :method and :pattern
13
+ #
14
+ presets:
15
+ author:
16
+ class: &class_author
17
+ method: css
18
+ pattern: ".author"
19
+ id: &id_author
20
+ method: css
21
+ pattern: "#author"
22
+ name: &name_author
23
+ method: css
24
+ pattern: ".author-name"
25
+ link: &link_author
26
+ method: xpath
27
+ pattern: "//a[contains(@href, 'author')]"
28
+ meta: &meta_author
29
+ method: xpath
30
+ pattern: "//meta[@name='author']/@content"
31
+ rel_link: &rel_link_author
32
+ method: xpath
33
+ pattern: "//a[@rel='author']"
34
+ vcard: &vcard_author
35
+ method: css
36
+ pattern: ".vcard .fn"
37
+ body:
38
+ readability: &readability_body
39
+ method: "readability"
40
+ pattern: ""
41
+ description:
42
+ meta: &meta_description
43
+ method: "xpath"
44
+ pattern: "//meta[@name='description']/@content"
45
+ og: &og_description
46
+ method: "xpath"
47
+ pattern: "//meta[@property='og:description']/@content"
48
+ keywords:
49
+ meta: &meta_keywords
50
+ method: "xpath"
51
+ pattern: "//meta[@name='keywords']/@content"
52
+ article_tag: &article_tag_keywords
53
+ method: "xpath"
54
+ pattern: "//meta[@property='article:tag']/@content"
55
+ section:
56
+ meta: &meta_section
57
+ method: "xpath"
58
+ pattern: "//meta[@property='article:section']/@content"
59
+ datetime:
60
+ article_date_original: &article_date_original_datetime
61
+ method: xpath
62
+ pattern: //meta[@name='article_date_original']/@content
63
+ article_published_time: &article_published_time_datetime
64
+ method: "xpath"
65
+ pattern: "//meta[@property='article:published_time']/@content"
66
+ date: &date_datetime
67
+ method: xpath
68
+ pattern: //meta[@name='date']/@content
69
+ date_published: &date_published_datetime
70
+ method: xpath
71
+ pattern: //*[@itemprop='datePublished']/@datetime
72
+ og_published_time: &og_published_time_datetime
73
+ method: xpath
74
+ pattern: //meta[@property='og:published_time']/@content
75
+ original_publication_date: &original_publication_date_datetime
76
+ method: xpath
77
+ pattern: //meta[@name='OriginalPublicationDate']/@content
78
+ publication_date: &publication_date_datetime
79
+ method: xpath
80
+ pattern: //meta[@name='publication_date']/@content
81
+ publish_date: &publish_date_datetime
82
+ method: xpath
83
+ pattern: //meta[@name='PublishDate']/@content
84
+ rnews_date_published: &rnews_date_published_datetime
85
+ method: xpath
86
+ pattern: //meta[@property='rnews:datePublished']/@content
87
+ sailthru_date: &sailthru_date_datetime
88
+ method: xpath
89
+ pattern: //meta[@name='sailthru.date']/@content
90
+ title:
91
+ html: &html_title
92
+ method: "xpath"
93
+ pattern: "//title"
94
+ og: &og_title
95
+ method: "xpath"
96
+ pattern: "//meta[@property='og:title']/@content"
97
+
98
+ domains:
99
+ investors.com:
100
+ author: *rel_link_author
101
+ body:
102
+ method: "css"
103
+ pattern: ".single-post-content"
104
+ description: *og_description
105
+ keywords: *meta_keywords
106
+ section: *meta_section
107
+ datetime: *article_published_time_datetime
108
+ title: *og_title
109
+ fool.com:
110
+ author: *meta_author
111
+ body: *readability_body
112
+ description: *meta_description
113
+ keywords: *article_tag_keywords
114
+ section: *meta_section
115
+ datetime: *date_datetime
116
+ title: *og_title
@@ -0,0 +1,4 @@
1
+ extractors:
2
+ google_news_rss:
3
+ article_urls: "tmp/google_news_rss/article_urls"
4
+
data/dev.yml ADDED
@@ -0,0 +1,13 @@
1
+ name: news-scraper
2
+
3
+ up:
4
+ - ruby: 2.3.1
5
+ - bundler
6
+
7
+ commands:
8
+ rubocop:
9
+ desc: 'Lint the Ruby code with Rubocop'
10
+ run: bundle exec rubocop
11
+ aliases: [rubo, lint, l]
12
+ test: bundle exec rake test
13
+ docs: bundle exec rake rdoc
@@ -0,0 +1,11 @@
1
class String
  # Returns a new string with every run of whitespace collapsed to a single
  # space and leading/trailing whitespace removed. The receiver is untouched.
  def squish
    copy = dup
    copy.squish!
  end

  # In-place variant of #squish. Always returns the receiver, even when
  # nothing changed (unlike gsub!/strip!, which return nil in that case).
  def squish!
    tap do |str|
      str.gsub!(/[[:space:]]+/, ' ')
      str.strip!
    end
  end
end
@@ -0,0 +1,106 @@
1
+ require 'readline'
2
+
3
module NewsScraper
  # Helpers for writing framed, coloured lines to $stdout and for reading
  # answers from the user. The module extends itself, so every public method
  # is callable as NewsScraper::CLI.<method>.
  module CLI
    extend self

    DEFAULT_COLOR = "\x1b[36m".freeze
    RESET_COLOR = "\x1b[0m".freeze
    INPUT_PROMPT = "\x1b[34m┃ > \x1b[33m".freeze

    # A CSI escape sequence: ESC [ followed by optional parameter bytes
    # (0x30-0x3F), optional intermediate bytes (0x20-0x2F) and one final byte
    # (0x40-0x7E). Covers colour codes such as "\e[36m" as well as
    # parameterless sequences such as "\e[K".
    ANSI_ESCAPE = /\x1b\[[0-?]*[ -\/]*[@-~]/

    # Writes +message+ to $stdout behind a coloured "┃" gutter.
    #
    # NOTE(review): +new_line+ appends "\n" to the message, but IO#puts does
    # not add a second newline to a string that already ends in one, so the
    # flag currently has no visible effect. Behaviour is kept as-is.
    def log(message, color: DEFAULT_COLOR, new_line: false)
      message += "\n" if new_line
      $stdout.puts "#{color}┃#{RESET_COLOR} #{message}"
    end

    # Logs each "\n"-separated line of +message+ through #log.
    def log_lines(message, color: DEFAULT_COLOR, new_line: false)
      message.split("\n").each do |line|
        log(line, color: color, new_line: new_line)
      end
    end

    # Prints "+msg+ (y/n)" and reads one line from $stdin.
    #
    # Returns a truthy value (0) when the answer starts with "y"/"Y"
    # (leading whitespace ignored), nil otherwise. Also returns nil when
    # $stdin is at EOF instead of raising NoMethodError on nil.
    def confirm(msg, color: DEFAULT_COLOR)
      print "#{color}┃#{RESET_COLOR} #{msg} (y/n) "
      answer = $stdin.gets
      return nil if answer.nil?

      # Anchored so that answers merely containing a "y" ("no way", "nay")
      # are not mistaken for a yes.
      answer =~ /\A\s*y/i
    end

    # Optionally logs +msg+, then reads one line with Readline.
    # Returns the entered line, or nil on EOF or when interrupted (Ctrl-C).
    def get_input(msg = nil)
      log(msg) if msg
      Readline.completion_append_character = " "
      Readline.completion_proc = nil
      result = begin
        Readline.readline(INPUT_PROMPT, true)
      rescue Interrupt
        nil
      end
      print RESET_COLOR # reset colour
      result
    end

    # Lists +options+ numbered from 1 and keeps prompting until the user
    # enters one of those numbers exactly (no padding, signs or spaces).
    #
    # Returns the chosen element of +options+, or nil when +options+ is empty
    # (there is nothing to choose, and prompting would never terminate).
    #
    # NOTE(review): EOF (Readline returning nil) re-prompts; with a
    # non-interactive, exhausted stdin this never terminates.
    def prompt_with_options(question, options)
      return nil if options.empty?

      log(question)
      log("Your options are:")
      options.each.with_index(1) do |option, idx|
        log("#{idx}) #{option}")
      end
      log("Choose a number between 1 and #{options.length}")

      Readline.completion_append_character = " "
      Readline.completion_proc = nil

      choice = nil
      until choice
        # Capture the begin/rescue value so that an Interrupt yields nil and
        # is handled like EOF (re-prompt) rather than leaving a stale value.
        line = begin
          Readline.readline(INPUT_PROMPT, true)
        rescue Interrupt
          nil
        end

        if line.nil?
          STDERR.puts
          next
        end

        entered = line.chomp
        number = entered.to_i
        # Only accept the canonical decimal form of an in-range index.
        choice = number if number.to_s == entered && number.between?(1, options.length)
      end

      print RESET_COLOR # reset colour
      options[choice - 1]
    end

    ## Fancy Headers and Footers

    # Prints a full-width header rule: "┏━━ <text>━━━…".
    def put_header(text = "", color = DEFAULT_COLOR)
      put_edge(color, "┏━━ ", text)
    end

    # Prints a full-width footer rule: "┗━━━…".
    def put_footer(color = DEFAULT_COLOR)
      put_edge(color, "┗", "")
    end

    # Prints +prefix+ and +text+ in +color+, padded with "━" to the terminal
    # width. Over-long text is cut from the end by the number of visible
    # characters it overflows, so escape sequences do not shorten the rule.
    def put_edge(color, prefix, text)
      ptext = "#{color}#{prefix}#{text}"
      termwidth = terminal_width

      overflow = printing_width(ptext) - termwidth
      ptext = ptext[0...(ptext.size - overflow)] if overflow > 0

      # Clamp at zero: String#* raises on a negative count.
      padwidth = [termwidth - printing_width(ptext), 0].max
      pad = "━" * padwidth

      $stdout.puts "#{ptext}#{color}#{pad}#{RESET_COLOR}\n"
    end

    # ANSI escape sequences (like \x1b[31m) have zero width.
    # When calculating the padding width, we must exclude them.
    def printing_width(str)
      str.gsub(ANSI_ESCAPE, '').size
    end

    private

    # Column count of the attached console, or 80 when IO.console is
    # unavailable or returns nil; never less than 30.
    def terminal_width
      console = IO.console if IO.respond_to?(:console)
      width = console ? console.winsize[1] : 80
      [width, 30].max
    end
  end
end
@@ -0,0 +1,6 @@
1
module NewsScraper
  module Constants
    # Absolute path to config/article_scrape_patterns.yml, resolved relative
    # to this file.
    SCRAPE_PATTERN_FILEPATH = File.expand_path('../../../config/article_scrape_patterns.yml', __FILE__)

    # Parsed scrape-pattern configuration, loaded once when this file is
    # required.
    #
    # Psych 4 (Ruby 3.1+) rejects YAML aliases in load_file unless
    # `aliases: true` is passed; older Psych versions do not accept that
    # argument and raise ArgumentError, in which case the plain call (which
    # permits aliases there) is used instead.
    # NOTE(review): presumably the config relies on anchors/aliases for its
    # presets - confirm against config/article_scrape_patterns.yml.
    SCRAPE_PATTERNS = begin
      YAML.load_file(SCRAPE_PATTERN_FILEPATH, aliases: true)
    rescue ArgumentError
      YAML.load_file(SCRAPE_PATTERN_FILEPATH)
    end
  end
end
@@ -0,0 +1,16 @@
1
+
2
module NewsScraper
  # Generic error class for failed responses.
  class ResponseError < StandardError; end

  module Transformers
    # Error carrying the root domain and URI for which no scrape pattern is
    # defined.
    class ScrapePatternNotDefined < StandardError
      attr_reader :root_domain, :uri

      # opts - Hash with optional :root_domain and :uri entries; both are
      #        exposed through the readers of the same name.
      #
      # The message is built explicitly: a bare `super` would forward the
      # opts Hash itself to StandardError, making #message the Hash's string
      # form ("{}" when no options are given).
      def initialize(opts = {})
        @root_domain = opts[:root_domain]
        @uri = opts[:uri]

        details = []
        details << "root_domain: #{@root_domain}" if @root_domain
        details << "uri: #{@uri}" if @uri

        message = 'Scrape pattern not defined'
        message += " (#{details.join(', ')})" unless details.empty?
        super(message)
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
+ require 'nokogiri'
2
+
3
module NewsScraper
  module Extractors
    # Fetches a single article by URL.
    class Article
      include ExtractorsHelpers

      # url - address passed to http_request when #extract is called.
      def initialize(url:)
        @url = url
      end

      # Requests the URL through http_request (expected to come from
      # ExtractorsHelpers) and returns the body of the response.
      def extract
        response = http_request(@url)
        response.body
      end
    end
  end
end