web2text 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 6e1c4b836214eee9b4901d606ebf4210744e4fae
+   data.tar.gz: c045333d81e7e5e4a5d2fac1d1b39d9d9ab0194e
+ SHA512:
+   metadata.gz: ae8c3350043a9a9213b6ec63429238d582736ecb48b934e0a51329dc69e4fad775c5f5c764628fb6ff5367f9bc8d1ebec4e867e0b12081e1cd98724b100fd56f
+   data.tar.gz: 93853ba2c589dda156578037eaaf76fe9647ba2c8b468ddbe12ce2a995d92b28e3e05b11650efe3f1732b8fe43f5f7f78885aa4a62dffda92f71743f3ac64d11
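
These digests cover the two archives packed inside the .gem tarball (a gem is a plain tar of metadata.gz, data.tar.gz, and this checksums file). A sketch of verifying them by hand, assuming web2text-0.0.1.gem has been downloaded and untarred:

    require 'digest'

    # compare against the SHA1/SHA512 values listed above
    puts Digest::SHA1.file('metadata.gz').hexdigest
    puts Digest::SHA512.file('data.tar.gz').hexdigest
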
bin/web2text ADDED
@@ -0,0 +1,10 @@
+ #!/usr/bin/env ruby
+
+ require 'web2text'
+
+ begin
+   options = Web2Text.parse_cli ARGV
+   Web2Text.do_crawl options
+ rescue Web2Text::Error => e
+   puts "#{e} Try -h for help"
+ end
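
The executable just hands ARGV to Web2Text.parse_cli and passes the resulting options to Web2Text.do_crawl, both defined in lib/web2text.rb below. Illustrative invocations, built only from the options that parser defines:

    web2text http://example.com/
    web2text --query article --sleep 2 --files out/ http://example.com/
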
lib/web2text.rb ADDED
@@ -0,0 +1,123 @@
+ require 'anemone'
+ require 'nokogiri'
+
+ require 'optparse'
+
+
+ module Web2Text
+   class Error < RuntimeError
+     def initialize(msg)
+       super msg
+     end
+   end
+
+   class CommandError < Error
+     def initialize(msg)
+       super msg
+     end
+   end
+
+   def self.parse_cli(args)
+     options = {
+       query: "body",
+       sleep: 0.0,
+       avoid: [],
+       focus: [],
+       formatter: LinePrinter,
+       ignore_robots_txt: false,
+       out: $stdout,
+     }
+
+     args = args.clone
+
+     OptionParser.new do |opts|
+       opts.banner = "Usage: web2text [options] http://example.com/"
+
+       opts.on("-q", "--css", "--query=CSS_QUERY", String) do |q|
+         options[:query] = q
+       end
+
+       opts.on("-s [OPTIONAL]", "--sleep [OPTIONAL]", Float, "Delay in seconds between requests. Default 0; a bare -s sets 1.") do |n|
+         options[:sleep] = n || 1.0
+       end
+
+       opts.on("--avoid x,y,z", Array, "List of paths to avoid when crawling. These paths and everything below them will be ignored.") do |avoid|
+         options[:avoid] = avoid
+       end
+
+       opts.on("--focus x,y,z", Array, "List of paths to process when crawling. Only these paths and pages below them will be processed.") do |focus|
+         options[:focus] = focus
+       end
+
+
+       opts.on("--lines [web2.txt]", String, "One line per page. Prints to stdout or a file.") do |f|
+         options[:formatter] = LinePrinter
+         options[:out] = if f then File.open(f, 'w') else $stdout end
+       end
+
+       opts.on("--files out/", String, "One file per page, following the website structure, in the specified directory.") do |o|
+         options[:formatter] = FilePrinter
+         options[:out] = Pathname(o)
+
+         if options[:out].exist? and !options[:out].directory? then
+           raise Web2Text::CommandError.new 'argument to --files must be a directory'
+         end
+       end
+
+       opts.on("--bad-robot", "Ignore robots.txt") do
+         options[:ignore_robots_txt] = true
+       end
+
+       opts.on_tail("-h", "--help", "Show this message") do
+         puts opts
+         exit
+       end
+     end.parse! args
+
+     if args.length != 1 then
+       raise Web2Text::CommandError.new 'incorrect number of arguments!'
+     end
+
+     options[:url] = args[0]
+     options
+   end
+
+   def self.do_crawl(options)
+     crawl = Crawl.new options[:url], options[:avoid], options[:focus]
+     crawler = Crawler.new crawl, options[:query]
+     formatter = options[:formatter].new crawl, options[:out]
+
+     Anemone.crawl(crawl.url, :obey_robots_txt => !options[:ignore_robots_txt]) do |anemone|
+       anemone.focus_crawl do |page|
+         crawl.filter page.links
+       end
+
+       anemone.on_every_page do |page|
+         STDERR.puts page.url
+
+         # ignore redirects
+         code = page.code || 200
+         if 300 <= code and code < 400
+           next
+         elsif !crawl.focus? page.url
+           next
+         elsif page.doc.nil?
+           STDERR.puts "ERR: Failed to retrieve #{page.url}"
+           next
+         end
+
+         plain = crawler.doc_as_plaintext page.doc
+         formatter.append plain, page.url
+         sleep options[:sleep]
+       end
+
+       anemone.after_crawl do
+         formatter.close
+       end
+     end
+   end
+ end
+
+ require 'web2text/crawl'
+ require 'web2text/crawler'
+ require 'web2text/formatters'
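
Nothing above is CLI-specific beyond the parsing, so the same flow can be driven from Ruby. A minimal sketch (the URL and query are placeholders):

    require 'web2text'

    # build the same options hash the executable builds from ARGV
    options = Web2Text.parse_cli ['--query', 'body', 'http://example.com/']
    Web2Text.do_crawl options
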
lib/web2text/crawl.rb ADDED
@@ -0,0 +1,45 @@
+ require 'uri'
+
+ class Web2Text::Crawl
+   attr_reader :url
+
+   def initialize(url, avoid = [], focus = [])
+     @url = url
+
+     @avoid = avoid.map { |a|
+       a = URI.join(url, a) if !a.start_with? url
+       a.to_s
+     }
+
+     @focus = focus.map { |a|
+       a = URI.join(url, a) if !a.start_with? url
+       a.to_s
+     }
+   end
+
+   def filter(urls)
+     urls.reject { |u| self.skip? u }
+   end
+
+   def skip?(url)
+     url_s = url.to_s
+     if !url_s.start_with? @url
+       return true
+     end
+
+     @avoid.any? { |a|
+       url_s.start_with? a
+     }
+   end
+
+   def focus?(url)
+     if @focus.empty?
+       true
+     else
+       url_s = url.to_s
+       @focus.any? { |a|
+         url_s.start_with? a
+       }
+     end
+   end
+ end
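
The prefix matching above is easiest to read with concrete URLs; a sketch matching the behaviour exercised in spec/crawl_spec.rb:

    require 'web2text'

    crawl = Web2Text::Crawl.new 'http://example.com', ['/avoid']
    crawl.skip? 'http://example.com/avoid/page.html'  # => true, under an avoided path
    crawl.skip? 'http://elsewhere.example.org/'       # => true, outside the crawl root
    crawl.skip? 'http://example.com/keep.html'        # => false
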
lib/web2text/crawler.rb ADDED
@@ -0,0 +1,19 @@
+
+ class Web2Text::Crawler
+   def initialize(crawl, query = "body")
+     @crawl = crawl
+     @query = query
+   end
+
+   def doc_as_plaintext(doc)
+     # just using inner_text doesn't give us quite enough spaces :(
+     doc.css(@query).collect do |j|
+       bits = []
+       j.traverse do |c|
+         if c.text? then bits.push c.content end
+       end
+
+       bits.join(' ')
+     end.join(' ')
+   end
+ end
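
Since doc_as_plaintext needs only a Nokogiri document, it can be tried in isolation; a sketch along the lines of spec/crawler_spec.rb:

    require 'nokogiri'
    require 'web2text'

    doc = Nokogiri::HTML '<html><body><h1>Title</h1><p>Body text</p></body></html>'
    crawl = Web2Text::Crawl.new 'http://example.com'
    Web2Text::Crawler.new(crawl, 'p').doc_as_plaintext doc  # => "Body text"
    Web2Text::Crawler.new(crawl).doc_as_plaintext doc       # => "Title Body text"
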
lib/web2text/formatters.rb ADDED
@@ -0,0 +1,50 @@
+ require 'uri'
+
+ class Web2Text::LinePrinter
+   def initialize(crawl, output)
+     @output = output
+     @first = true
+   end
+
+   def append(doc, uri)
+     if !@first then
+       @output.write "\n"
+     end
+     @first = false
+
+     @output.write doc.gsub(/\n+/, ' ')
+     self
+   end
+
+   def close
+     @output.close
+   end
+ end
+
+ # Writes one file per page
+ class Web2Text::FilePrinter
+   def initialize(crawl, out_dir)
+     root_path = URI(crawl.url).path.to_s
+     root_path = "/" if root_path.empty?
+
+     @crawl_root = Pathname(root_path)
+     @out_dir = Pathname(out_dir)
+     @out_dir.mkpath
+   end
+
+   def append(doc, uri)
+     path = @out_dir + Pathname(URI(uri).path).relative_path_from(@crawl_root)
+     if path.extname == "" then
+       path = path + 'index.txt'
+     end
+
+     path = path.sub_ext('.txt')
+
+     path.parent.mkpath
+     path.open("w") { |f| f.write(doc) }
+     self
+   end
+
+   def close
+   end
+ end
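
Both printers implement the append/close protocol that do_crawl drives. A sketch of LinePrinter against an in-memory IO, much as spec/formatters_spec.rb does:

    require 'stringio'
    require 'web2text'

    crawl = Web2Text::Crawl.new 'http://example.com/'
    out = StringIO.new
    printer = Web2Text::LinePrinter.new crawl, out
    printer.append "first page\nof text", 'http://example.com/'  # embedded newlines collapse to spaces
    printer.append 'second page', 'http://example.com/two.html'
    out.string  # => "first page of text\nsecond page"
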
lib/web2text/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Web2Text
+   VERSION = "0.0.1"
+ end
spec/crawl_spec.rb ADDED
@@ -0,0 +1,45 @@
+ require 'web2text'
+
+ root = "http://example.com"
+
+ RSpec.describe Crawl, '#filter' do
+   context "with no patterns" do
+     it "returns all links" do
+       crawl = Crawl.new root
+       links = ["#{root}/wow", "#{root}/neat"]
+       expect(crawl.filter links).to eq links
+       expect(links.select { |u| crawl.focus? u }).to eq links
+     end
+
+     it "will not crawl above the root" do
+       crawl = Crawl.new "#{root}/wow/cool"
+       expect(crawl.skip? root).to be true
+     end
+   end
+
+   context "with patterns" do
+     it "can filter out whole directories" do
+       good = ["#{root}/wow", "#{root}/neat"]
+       bad = ["#{root}/avoid", "#{root}/avoid/index.html", "#{root}/avoid/this/nested/stuff"]
+
+       crawl = Crawl.new root, ["#{root}/avoid"]
+       expect(crawl.filter good + bad).to eq good
+     end
+
+     it "can focus on pages" do
+       bad = ["#{root}/avoid", "#{root}/avoid"]
+       good = ["#{root}/focus", "#{root}/focus/index.html", "#{root}/focus/this/nested/stuff"]
+
+       crawl = Crawl.new root, [], ["#{root}/focus"]
+       expect((good + bad).select { |u| crawl.focus? u }).to eq good
+     end
+
+     it "can take avoid paths relative to the root to filter out directories" do
+       good = ["#{root}/wow", "#{root}/neat"]
+       bad = ["#{root}/avoid", "#{root}/avoid/index.html", "#{root}/avoid/this/nested/stuff"]
+
+       crawl = Crawl.new root, ["/avoid"]
+       expect(crawl.filter good + bad).to eq good
+     end
+   end
+ end
spec/crawler_spec.rb ADDED
@@ -0,0 +1,39 @@
+ require 'nokogiri'
+
+ require 'web2text'
+
+ RSpec.describe Crawler, '#doc_as_plaintext' do
+   before(:all) do
+     @root = "http://example.com"
+     @crawl = Crawl.new @root
+
+     @h1_content = "This is a document"
+     @p_content = "good stuff!"
+
+     @example_html = Nokogiri::HTML "<!doctype html><html><head></head><body><h1>#{@h1_content}</h1><p>#{@p_content}</p></body></html>"
+   end
+
+   before(:each) do
+     @crawler = Crawler.new @crawl
+   end
+
+   it 'can consider a page and make output' do
+     out = @crawler.doc_as_plaintext @example_html
+     expect(out).to eq "#{@h1_content} #{@p_content}"
+   end
+
+   it 'can limit the output by using css queries' do
+     tests = [
+       ["p", @p_content],
+       ["h1", @h1_content],
+       ["p, h1", "#{@h1_content} #{@p_content}"],
+       ["h1, p", "#{@h1_content} #{@p_content}"]
+     ]
+
+     tests.each do |test|
+       @crawler = Crawler.new @crawl, test[0]
+       out = @crawler.doc_as_plaintext @example_html
+       expect(out).to eq(test[1]), "with css query '#{test[0]}', got '#{out}', but expected '#{test[1]}'"
+     end
+   end
+ end
spec/formatters_spec.rb ADDED
@@ -0,0 +1,81 @@
+ require 'rspec'
+ require 'stringio'
+ require 'test_construct'
+
+ require 'web2text'
+
+ LinePrinter = Web2Text::LinePrinter
+ FilePrinter = Web2Text::FilePrinter
+
+ doc1 = "This is a document\nwith a newline"
+ doc2 = "This is another document"
+ root = 'http://example.com/wow/'
+
+ RSpec.describe LinePrinter, '#append' do
+   it 'prints one line per document' do
+     crawl = Crawl.new root
+     result = StringIO.open do |out|
+       LinePrinter.new(crawl, out)
+         .append(doc1, "#{root}index.html")
+         .append(doc2, "#{root}/cool/index.html")
+
+       out.string
+     end
+
+     expect(result.lines.length).to eq(2)
+     expect(result.lines[1]).to eq(doc2)
+   end
+ end
+
+
+ RSpec.describe FilePrinter, '#append' do
+   include TestConstruct::Helpers
+
+   it 'prints one file per document' do
+     crawl = Crawl.new root
+     folder = 'test_output/'
+
+     within_construct do |construct|
+       construct.directory 'fileprinter_web2text' do |d|
+         FilePrinter.new(crawl, folder)
+           .append(doc1, "#{root}/")
+           .append(doc2, "#{root}/cool/index.html")
+           .append(doc1, "#{root}/no_slash")
+
+         doc1_path = File.join folder, 'index.txt'
+         expect(File.file?(doc1_path)).to be_truthy
+         expect(IO.read(doc1_path)).to eq(doc1)
+
+         doc2_path = File.join(folder, 'cool', 'index.txt')
+         expect(File.file?(doc2_path)).to be_truthy
+         expect(IO.read(doc2_path)).to eq(doc2)
+
+         doc3_path = File.join folder, 'no_slash', 'index.txt'
+         expect(File.file?(doc3_path)).to be_truthy
+         expect(IO.read(doc3_path)).to eq(doc1)
+
+       end
+     end
+   end
+ end
+
+ RSpec.describe FilePrinter do
+   include TestConstruct::Helpers
+
+   it "doesn't choke on roots with no path (eg. http://example.com)" do
+     tricky = "http://example.com"
+     crawl = Crawl.new tricky
+     folder = 'test_output/'
+
+     within_construct do |construct|
+       construct.directory 'fileprinter_web2text' do |d|
+         FilePrinter.new(crawl, folder)
+           .append(doc1, "#{tricky}/")
+
+         doc1_path = File.join folder, 'index.txt'
+         expect(File.file?(doc1_path)).to be_truthy
+         expect(IO.read(doc1_path)).to eq(doc1)
+       end
+     end
+   end
+ end
spec/spec_helper.rb ADDED
@@ -0,0 +1,27 @@
+ require 'rspec'
+
+ require 'web2text'
+
+ Crawler = Web2Text::Crawler
+ Crawl = Web2Text::Crawl
+
+
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+ RSpec.configure do |config|
+   config.expect_with :rspec do |expectations|
+     expectations.include_chain_clauses_in_custom_matcher_descriptions = true
+   end
+
+   config.mock_with :rspec do |mocks|
+     mocks.verify_partial_doubles = true
+   end
+
+   config.warnings = true
+   config.order = :random
+
+   # Seed global randomization in this process using the `--seed` CLI option.
+   # Setting this allows you to use `--seed` to deterministically reproduce
+   # test failures related to randomization by passing the same `--seed` value
+   # as the one that triggered the failure.
+   Kernel.srand config.seed
+ end
spec/web2text_spec.rb ADDED
@@ -0,0 +1,30 @@
+ require 'web2text'
+ require 'web2text/version'
+ require 'shellwords'
+
+ ROOT = "http://example.com"
+
+ def parse(args)
+   Web2Text.parse_cli "#{args} #{ROOT}".shellsplit
+ end
+
+ RSpec.describe Web2Text do
+   it 'has a semver VERSION' do
+     expect(Web2Text::VERSION =~ /\d+\.\d+\.\d+/).to be 0
+   end
+ end
+
+ RSpec.describe Web2Text, '#parse_cli' do
+   it 'defaults to 0 sleep' do
+     expect(parse('')[:sleep]).to be 0.0
+   end
+
+   it 'sleeps for 1s with -s' do
+     expect(parse('-s')[:sleep]).to be 1.0
+   end
+
+   it 'can specify sleep with -s N or --sleep N' do
+     expect(parse('-s 2')[:sleep]).to be 2.0
+     expect(parse('--sleep 2')[:sleep]).to be 2.0
+   end
+ end
metadata ADDED
@@ -0,0 +1,143 @@
+ --- !ruby/object:Gem::Specification
+ name: web2text
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Alex Wilson
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2015-07-16 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: anemone
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.7'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.7'
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 1.6.6
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.6.6.2
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 1.6.6
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.6.6.2
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.3'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 3.3.0
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.3'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 3.3.0
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.4'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 10.4.2
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.4'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 10.4.2
+ - !ruby/object:Gem::Dependency
+   name: test_construct
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.0'
+ description:
+ email:
+ executables:
+ - web2text
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - bin/web2text
+ - lib/web2text.rb
+ - lib/web2text/crawl.rb
+ - lib/web2text/crawler.rb
+ - lib/web2text/formatters.rb
+ - lib/web2text/version.rb
+ - spec/crawl_spec.rb
+ - spec/crawler_spec.rb
+ - spec/formatters_spec.rb
+ - spec/spec_helper.rb
+ - spec/web2text_spec.rb
+ homepage: https://github.com/yourpalal/web2text
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: Scrape a website as plain text.
+ test_files: []
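
The specification above resolves to an ordinary RubyGems install, pulling anemone (~> 0.7) and nokogiri (~> 1.6.6, >= 1.6.6.2) as runtime dependencies:

    gem install web2text -v 0.0.1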