RubyGems - web2text - Versions diffs - 0.0.1 - Mend

web2text 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 6e1c4b836214eee9b4901d606ebf4210744e4fae
+  data.tar.gz: c045333d81e7e5e4a5d2fac1d1b39d9d9ab0194e
+SHA512:
+  metadata.gz: ae8c3350043a9a9213b6ec63429238d582736ecb48b934e0a51329dc69e4fad775c5f5c764628fb6ff5367f9bc8d1ebec4e867e0b12081e1cd98724b100fd56f
+  data.tar.gz: 93853ba2c589dda156578037eaaf76fe9647ba2c8b468ddbe12ce2a995d92b28e3e05b11650efe3f1732b8fe43f5f7f78885aa4a62dffda92f71743f3ac64d11

data/bin/web2text ADDED

@@ -0,0 +1,10 @@
+#!/usr/bin/env ruby
+# Command-line entry point for web2text: parses ARGV, then runs the crawl.
+require 'web2text'
+begin
+  options = Web2Text::parse_cli ARGV
+  Web2Text::do_crawl options
+rescue Web2Text::Error, OptionParser::ParseError => e
+  # Usage problems (including unknown or malformed options reported by
+  # OptionParser) go to stderr with a non-zero exit status, so that piped
+  # stdout output stays clean and shell callers can detect the failure.
+  warn "#{e.message} Try -h for help"
+  exit 1
+end

data/lib/web2text.rb ADDED

@@ -0,0 +1,123 @@
+require 'anemone'
+require 'nokogiri'
+require 'optparse'
+require 'pathname'
+module Web2Text
+  # Base class for every error raised by Web2Text itself.
+  # No custom initializer is defined, so both `raise Error, "msg"` and a bare
+  # `raise Error` work (the latter uses Ruby's default message).
+  class Error < RuntimeError
+  end
+  # Raised when the command-line arguments are invalid.
+  class CommandError < Error
+  end
+  # Parses web2text command-line arguments into an options hash.
+  #
+  # args - Array of argument strings (e.g. ARGV). It is cloned before
+  #        parsing, so the caller's array is left untouched.
+  #
+  # Returns a Hash with the keys :query, :sleep, :avoid, :focus, :formatter,
+  # :ignore_robots_txt, :out and :url.
+  # Raises Web2Text::CommandError when --files names an existing path that is
+  # not a directory, or when there is not exactly one positional argument.
+  # Prints the usage text and exits the process when -h/--help is given.
+  def self.parse_cli(args)
+    options = {
+      query: "body",
+      sleep: 0.0,
+      avoid: [],
+      focus: [],
+      formatter: LinePrinter,
+      ignore_robots_txt: false,
+      out: $stdout,
+    }
+    args = args.clone
+    OptionParser.new do |opts|
+      opts.banner = "Usage: web2text [options] http://example.com/"
+      opts.on("-q", "--css", "--query=CSS_QUERY", String, "CSS query selecting the content to extract. Default: body.") do |q|
+        options[:query] = q
+      end
+      # The value is optional: a bare -s/--sleep yields nil, which means 1 second.
+      opts.on("-s [OPTIONAL]", "--sleep [OPTIONAL]", Float, "Delay between requests in seconds. Default 0; -s without a value sets it to 1.") do |n|
+        options[:sleep] = n || 1.0
+      end
+      opts.on("--avoid x,y,z", Array, "List of paths to avoid when crawling. These paths and everything below them will be ignored.") do |avoid|
+        options[:avoid] = avoid
+      end
+      opts.on("--focus x,y,z", Array, "List of paths to process when crawling. Only these paths and pages below them will be processed") do |focus|
+        options[:focus] = focus
+      end
+      opts.on("--lines [web2.txt]", String, "One line per page. Can print to std out or a file.") do |f|
+        options[:formatter] = LinePrinter
+        options[:out] = f ? File.open(f, 'w') : $stdout
+      end
+      opts.on("--files out/", String, "One file per page. Following website structure, in the specified directory.") do |o|
+        options[:formatter] = FilePrinter
+        options[:out] = Pathname(o)
+        # A missing directory is fine here; only an existing non-directory is rejected.
+        if options[:out].exist? && !options[:out].directory?
+          raise Web2Text::CommandError, 'argument to --files must be a directory'
+        end
+      end
+      opts.on("--bad-robot", "Ignore robots.txt") do
+        options[:ignore_robots_txt] = true
+      end
+      opts.on_tail("-h", "--help", "Show this message") do
+        puts opts
+        exit
+      end
+    end.parse! args
+    # parse! strips the recognised options; what remains must be the single start URL.
+    raise Web2Text::CommandError, 'incorrect number of arguments!' if args.length != 1
+    options[:url] = args[0]
+    options
+  end
+  # Runs a crawl described by an options hash (as returned by parse_cli):
+  # every in-focus, successfully fetched page is converted to plain text and
+  # handed to the configured formatter; the formatter is closed afterwards.
+  def self.do_crawl(options)
+    crawl = Crawl.new options[:url], options[:avoid], options[:focus]
+    crawler = Crawler.new crawl, options[:query]
+    formatter = options[:formatter].new crawl, options[:out]
+    obey_robots = !options[:ignore_robots_txt]
+    Anemone.crawl(crawl.url, :obey_robots_txt => obey_robots) do |anemone|
+      # Only follow links that the crawl does not want skipped.
+      anemone.focus_crawl { |page| crawl.filter page.links }
+      anemone.on_every_page do |page|
+        STDERR.puts page.url
+        status = page.code || 200
+        # ignore redirects
+        next if status >= 300 && status < 400
+        next unless crawl.focus? page.url
+        if page.doc.nil?
+          STDERR.puts "ERR: Failed to retrieve #{page.url}"
+          next
+        end
+        formatter.append crawler.doc_as_plaintext(page.doc), page.url
+        sleep options[:sleep]
+      end
+      anemone.after_crawl { formatter.close }
+    end
+  end
+end
+require 'web2text/crawl'
+require 'web2text/crawler'
+require 'web2text/formatters'

data/lib/web2text/crawl.rb ADDED

@@ -0,0 +1,45 @@
+require 'uri'
+# Describes the scope of one crawl: a root URL plus optional URL prefixes to
+# avoid or to focus on. All matching is plain string-prefix matching.
+class Web2Text::Crawl
+  attr_reader :url
+  # url   - root URL of the crawl, as a String.
+  # avoid - paths or URLs; any URL starting with one of them is skipped.
+  # focus - paths or URLs; when non-empty, only URLs starting with one of
+  #         them count as in focus.
+  # Entries that do not already start with url are resolved against it.
+  def initialize(url, avoid = [], focus = [])
+    @url = url
+    @avoid = absolute_prefixes(avoid)
+    @focus = absolute_prefixes(focus)
+  end
+  # Returns urls without the entries for which skip? is true.
+  def filter(urls)
+    urls.reject { |u| skip? u }
+  end
+  # True when url does not start with the crawl root, or starts with one of
+  # the avoided prefixes.
+  def skip?(url)
+    url_s = url.to_s
+    return true unless url_s.start_with? @url
+    @avoid.any? { |prefix| url_s.start_with? prefix }
+  end
+  # True when no focus prefixes were given, or url starts with one of them.
+  def focus?(url)
+    return true if @focus.empty?
+    url_s = url.to_s
+    @focus.any? { |prefix| url_s.start_with? prefix }
+  end
+  private
+  # Turns each entry into an absolute URL string, joining entries that do not
+  # already start with the root URL onto it.
+  def absolute_prefixes(paths)
+    paths.map do |path|
+      path = URI::join(@url, path) unless path.start_with? @url
+      path.to_s
+    end
+  end
+end

data/lib/web2text/crawler.rb ADDED

@@ -0,0 +1,19 @@
+# Extracts plain text from parsed documents using a CSS query.
+class Web2Text::Crawler
+  # crawl - the crawl this crawler belongs to (stored, not otherwise used here).
+  # query - CSS query selecting the nodes whose text is extracted.
+  def initialize(crawl, query="body")
+    @crawl = crawl
+    @query = query
+  end
+  # Returns the text of every node matching the query, with the individual
+  # text nodes and the individual matches each joined by a single space.
+  def doc_as_plaintext(doc)
+    # just using inner_text doesn't give us quite enough spaces :(
+    pieces = doc.css(@query).map do |match|
+      texts = []
+      match.traverse { |node| texts << node.content if node.text? }
+      texts.join(' ')
+    end
+    pieces.join(' ')
+  end
+end

data/lib/web2text/formatters.rb ADDED

@@ -0,0 +1,50 @@
+require 'uri'
+# Writes one line per page to a single IO-like output
+class Web2Text::LinePrinter
+  # crawl  - not used by this formatter.
+  # output - object responding to write, flush and close (a File, $stdout, ...).
+  def initialize(crawl, output)
+    @output = output
+    @first = true
+  end
+  # Writes doc as one line: runs of newlines inside doc become a single
+  # space, and a newline separates it from the previously written document.
+  # uri is not used. Returns self so calls can be chained.
+  def append(doc, uri)
+    @output.write "\n" unless @first
+    @first = false
+    @output.write doc.gsub(/\n+/, ' ')
+    self
+  end
+  # Finishes the output. The process-wide standard streams are only flushed,
+  # never closed, so code running after the crawl can still print; any other
+  # output (e.g. a File) is closed.
+  def close
+    if @output.equal?($stdout) || @output.equal?($stderr)
+      @output.flush
+    else
+      @output.close
+    end
+  end
+end
+# Writes one .txt file per page, laid out below the output directory the same
+# way the pages are laid out below the crawl root.
+class Web2Text::FilePrinter
+  # crawl   - the crawl; the path of its url is the root that page paths are
+  #           made relative to ("/" when the url has no path).
+  # out_dir - directory to write into; created if missing.
+  def initialize(crawl, out_dir)
+    base = URI(crawl.url).path.to_s
+    @crawl_root = Pathname(base.empty? ? "/" : base)
+    @out_dir = Pathname(out_dir)
+    @out_dir.mkpath
+  end
+  # Writes doc to the file that corresponds to uri. Paths without an
+  # extension get an index.txt inside them; any other extension is replaced
+  # by .txt. Returns self so calls can be chained.
+  def append(doc, uri)
+    relative = Pathname(URI(uri).path).relative_path_from(@crawl_root)
+    target = @out_dir + relative
+    target = target + 'index.txt' if target.extname == ""
+    target = target.sub_ext('.txt')
+    target.parent.mkpath
+    target.open("w") { |io| io.write(doc) }
+    self
+  end
+  # Nothing to release: each file is closed right after it is written.
+  def close
+  end
+end

data/lib/web2text/version.rb ADDED

@@ -0,0 +1,3 @@
+module Web2Text
+  # Version of the web2text gem; frozen so it cannot be modified in place.
+  VERSION = "0.0.1".freeze
+end

data/spec/crawl_spec.rb ADDED

@@ -0,0 +1,45 @@
+require 'web2text'
+# spec_helper defines the Crawl alias used below; require it explicitly so
+# this file does not depend on it having been loaded some other way.
+require 'spec_helper'
+root = "http://example.com"
+RSpec.describe Crawl, '#filter' do
+  context "with no patterns" do
+    it "returns all links" do
+      crawl = Crawl.new root
+      links = ["#{root}/wow", "#{root}/neat"]
+      expect(crawl.filter links).to eq links
+      expect(links.select {|u| crawl.focus? u}).to eq links
+    end
+    it "will not crawl above the root" do
+      crawl = Crawl.new "#{root}/wow/cool"
+      expect(crawl.skip? root).to be true
+    end
+  end
+  context "with patterns" do
+    it "can filter out whole directories" do
+      good = ["#{root}/wow", "#{root}/neat"]
+      bad = ["#{root}/avoid", "#{root}/avoid/index.html", "#{root}/avoid/this/nested/stuff"]
+      crawl = Crawl.new root, ["#{root}/avoid"]
+      expect(crawl.filter good + bad).to eq good
+    end
+    it "can focus on pages" do
+      # Two distinct out-of-focus URLs (the list used to repeat the same one).
+      bad = ["#{root}/avoid", "#{root}/avoid/index.html"]
+      good = ["#{root}/focus", "#{root}/focus/index.html", "#{root}/focus/this/nested/stuff"]
+      crawl = Crawl.new root, [], ["#{root}/focus"]
+      expect((good + bad).select {|u| crawl.focus? u}).to eq good
+    end
+    it "can skip host name parts to filter out directories" do
+      good = ["#{root}/wow", "#{root}/neat"]
+      bad = ["#{root}/avoid", "#{root}/avoid/index.html", "#{root}/avoid/this/nested/stuff"]
+      crawl = Crawl.new root, ["/avoid"]
+      expect(crawl.filter good + bad).to eq good
+    end
+  end
+end

data/spec/crawler_spec.rb ADDED

@@ -0,0 +1,39 @@
+require 'nokogiri'
+require 'web2text'
+# spec_helper defines the Crawl and Crawler aliases used below.
+require 'spec_helper'
+# The method under test is Crawler#doc_as_plaintext.
+RSpec.describe Crawler, '#doc_as_plaintext' do
+  before(:all) do
+    @root = "http://example.com"
+    @crawl = Crawl.new @root
+    @h1_content = "This is a document"
+    @p_content = "good stuff!"
+    @example_html = Nokogiri::HTML "<!doctype html><html><head></head><body><h1>#{@h1_content}</h1><p>#{@p_content}</p></body></html>"
+  end
+  before(:each) do
+    @crawler = Crawler.new @crawl
+  end
+  it 'can consider a page and make output' do
+    out = @crawler.doc_as_plaintext @example_html
+    expect(out).to eq "#{@h1_content} #{@p_content}"
+  end
+  it 'can limit the output by using css queries' do
+    # Each entry is [css query, expected plain text].
+    tests = [
+      ["p", @p_content],
+      ["h1", @h1_content],
+      ["p, h1", "#{@h1_content} #{@p_content}"],
+      ["h1, p", "#{@h1_content} #{@p_content}"]
+    ]
+    tests.each do |query, expected|
+      @crawler = Crawler.new @crawl, query
+      out = @crawler.doc_as_plaintext @example_html
+      expect(out).to eq(expected), "with css query '#{query}', got '#{out}', but expected '#{expected}'"
+    end
+  end
+end

data/spec/formatters_spec.rb ADDED

@@ -0,0 +1,81 @@
+require 'rspec'
+require 'stringio'
+require 'test_construct'
+require 'web2text'
+# spec_helper defines the Crawl alias used below.
+require 'spec_helper'
+LinePrinter = Web2Text::LinePrinter
+FilePrinter = Web2Text::FilePrinter
+doc1 = "This is a document\nwith a newline"
+doc2 = "This is another document"
+# root ends with a slash, so page URLs below are built as "#{root}page"
+# rather than "#{root}/page", which would contain a double slash.
+root = 'http://example.com/wow/'
+RSpec.describe LinePrinter, '#append' do
+  it 'prints one line per document' do
+    crawl = Crawl.new root
+    result = StringIO::open do |out|
+      LinePrinter.new(crawl, out)
+        .append(doc1, "#{root}index.html")
+        .append(doc2, "#{root}cool/index.html")
+      out.string
+    end
+    expect(result.lines.length).to eq(2)
+    expect(result.lines[1]).to eq(doc2)
+  end
+end
+RSpec.describe FilePrinter, '#append' do
+  include TestConstruct::Helpers
+  it 'prints one file per document' do
+    crawl = Crawl.new root
+    folder = 'test_output/'
+    within_construct() do |construct|
+      construct.directory 'fileprinter_web2text' do |d|
+        FilePrinter.new(crawl, folder)
+          .append(doc1, root)
+          .append(doc2, "#{root}cool/index.html")
+          .append(doc1, "#{root}no_slash")
+        doc1_path = File.join folder, 'index.txt'
+        expect(File::file?(doc1_path)).to be_truthy
+        expect(IO.read(doc1_path)).to eq(doc1)
+        doc2_path = File.join(folder, 'cool', 'index.txt')
+        expect(File.file?(doc2_path)).to be_truthy
+        expect(IO.read(doc2_path)).to eq(doc2)
+        doc3_path = File.join folder, 'no_slash', 'index.txt'
+        expect(File::file?(doc3_path)).to be_truthy
+        expect(IO.read(doc3_path)).to eq(doc1)
+      end
+    end
+  end
+end
+RSpec.describe FilePrinter do
+  include TestConstruct::Helpers
+  it "doesn't choke on roots with no path (eg. http://example.com)" do
+    tricky = "http://example.com"
+    crawl = Crawl.new tricky
+    folder = 'test_output/'
+    within_construct() do |construct|
+      construct.directory 'fileprinter_web2text' do |d|
+        FilePrinter.new(crawl, folder)
+          .append(doc1, "#{tricky}/")
+        doc1_path = File.join folder, 'index.txt'
+        expect(File::file?(doc1_path)).to be_truthy
+        expect(IO.read(doc1_path)).to eq(doc1)
+      end
+    end
+  end
+end

data/spec/spec_helper.rb ADDED

@@ -0,0 +1,27 @@
+require 'rspec'
+require 'web2text'
+# Top-level aliases so the specs can name the classes without the namespace.
+Crawl = Web2Text::Crawl
+Crawler = Web2Text::Crawler
+# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+RSpec.configure do |config|
+  config.expect_with(:rspec) do |expectations|
+    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
+  end
+  config.mock_with(:rspec) { |mocks| mocks.verify_partial_doubles = true }
+  config.warnings = true
+  config.order = :random
+  # Seed Kernel's random number generator with RSpec's seed, so that passing
+  # the `--seed` value of a failing run reproduces randomization-related
+  # failures deterministically.
+  Kernel.srand config.seed
+end

data/spec/web2text_spec.rb ADDED

@@ -0,0 +1,30 @@
+require 'web2text'
+require 'web2text/version'
+require 'shellwords'
+ROOT = "http://example.com"
+# Parses the given option string, followed by the ROOT url, the way the
+# command line would be split by a shell.
+def parse(args)
+  Web2Text.parse_cli "#{args} #{ROOT}".shellsplit
+end
+RSpec.describe Web2Text do
+  it 'has a semver VERSION' do
+    # Anchored so the whole string, not just a prefix, must be MAJOR.MINOR.PATCH.
+    expect(Web2Text::VERSION).to match(/\A\d+\.\d+\.\d+\z/)
+  end
+end
+# Float expectations use `eq` (value equality); `be` compares object identity.
+RSpec.describe Web2Text, '#parse_cli' do
+  it 'defaults to 0 sleep' do
+    expect(parse('')[:sleep]).to eq 0.0
+  end
+  it 'sleeps for 1s with -s' do
+    expect(parse('-s')[:sleep]).to eq 1.0
+  end
+  it 'can specify sleep with -s N or --sleep N' do
+    expect(parse('-s 2')[:sleep]).to eq 2.0
+    expect(parse('--sleep 2')[:sleep]).to eq 2.0
+  end
+end

metadata ADDED

@@ -0,0 +1,143 @@
+--- !ruby/object:Gem::Specification
+name: web2text
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Alex Wilson
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2015-07-16 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: anemone
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.7'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.7'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.6.6
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.6.6.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.6.6
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.6.6.2
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.3'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 3.3.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.3'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 3.3.0
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.4'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 10.4.2
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.4'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 10.4.2
+- !ruby/object:Gem::Dependency
+  name: test_construct
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.0'
+description:
+email:
+executables:
+- web2text
+extensions: []
+extra_rdoc_files: []
+files:
+- bin/web2text
+- lib/web2text.rb
+- lib/web2text/crawl.rb
+- lib/web2text/crawler.rb
+- lib/web2text/formatters.rb
+- lib/web2text/version.rb
+- spec/crawl_spec.rb
+- spec/crawler_spec.rb
+- spec/formatters_spec.rb
+- spec/spec_helper.rb
+- spec/web2text_spec.rb
+homepage: https://github.com/yourpalal/web2text
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.2.2
+signing_key:
+specification_version: 4
+summary: Scrape a website as plain text.
+test_files: []