RubyGems - docparser - Versions diffs - 0.0.1 → 0.1.0 - Mend

docparser 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (81)

checksums.yaml +4 -4
data/.gitignore +2 -0
data/.rubocop.yml +5 -0
data/.travis.yml +3 -0
data/Gemfile +9 -1
data/README.md +11 -4
data/Rakefile +15 -0
data/example.rb +9 -7
data/lib/docparser.rb +1 -0
data/lib/docparser/document.rb +18 -11
data/lib/docparser/output.rb +8 -8
data/lib/docparser/output/html_output.rb +53 -47
data/lib/docparser/output/json_output.rb +8 -3
data/lib/docparser/output/multi_output.rb +4 -8
data/lib/docparser/output/nil_output.rb +21 -0
data/lib/docparser/output/screen_output.rb +2 -1
data/lib/docparser/output/xlsx_output.rb +12 -2
data/lib/docparser/output/yaml_output.rb +6 -1
data/lib/docparser/parser.rb +80 -49
data/lib/docparser/version.rb +1 -1
data/test/lib/docparser/blackbox_test.rb +29 -0
data/test/lib/docparser/document_test.rb +134 -0
data/test/lib/docparser/logging_test.rb +19 -0
data/test/lib/docparser/output/csv_output_test.rb +51 -0
data/test/lib/docparser/output/html_output_test.rb +57 -0
data/test/lib/docparser/output/json_output_test.rb +65 -0
data/test/lib/docparser/output/multi_output_test.rb +80 -0
data/test/lib/docparser/output/nil_output_test.rb +27 -0
data/test/lib/docparser/output/screen_output_test.rb +55 -0
data/test/lib/docparser/output/xlsx_output_test.rb +53 -0
data/test/lib/docparser/output/yaml_output_test.rb +76 -0
data/test/lib/docparser/output_test.rb +85 -0
data/test/lib/docparser/parser_test.rb +197 -0
data/test/lib/docparser/version_test.rb +11 -0
data/test/support/hackaday/dl.rb +4 -0
data/test/support/hackaday/file_1.html +716 -0
data/test/support/hackaday/file_10.html +791 -0
data/test/support/hackaday/file_11.html +787 -0
data/test/support/hackaday/file_12.html +715 -0
data/test/support/hackaday/file_13.html +793 -0
data/test/support/hackaday/file_14.html +718 -0
data/test/support/hackaday/file_15.html +707 -0
data/test/support/hackaday/file_16.html +713 -0
data/test/support/hackaday/file_17.html +715 -0
data/test/support/hackaday/file_18.html +725 -0
data/test/support/hackaday/file_19.html +715 -0
data/test/support/hackaday/file_2.html +793 -0
data/test/support/hackaday/file_20.html +795 -0
data/test/support/hackaday/file_21.html +804 -0
data/test/support/hackaday/file_22.html +722 -0
data/test/support/hackaday/file_23.html +793 -0
data/test/support/hackaday/file_24.html +717 -0
data/test/support/hackaday/file_25.html +715 -0
data/test/support/hackaday/file_26.html +717 -0
data/test/support/hackaday/file_27.html +723 -0
data/test/support/hackaday/file_28.html +711 -0
data/test/support/hackaday/file_29.html +711 -0
data/test/support/hackaday/file_3.html +794 -0
data/test/support/hackaday/file_30.html +715 -0
data/test/support/hackaday/file_31.html +713 -0
data/test/support/hackaday/file_32.html +714 -0
data/test/support/hackaday/file_33.html +716 -0
data/test/support/hackaday/file_34.html +714 -0
data/test/support/hackaday/file_35.html +792 -0
data/test/support/hackaday/file_36.html +719 -0
data/test/support/hackaday/file_37.html +712 -0
data/test/support/hackaday/file_38.html +709 -0
data/test/support/hackaday/file_39.html +808 -0
data/test/support/hackaday/file_4.html +814 -0
data/test/support/hackaday/file_40.html +801 -0
data/test/support/hackaday/file_5.html +715 -0
data/test/support/hackaday/file_6.html +792 -0
data/test/support/hackaday/file_7.html +714 -0
data/test/support/hackaday/file_8.html +717 -0
data/test/support/hackaday/file_9.html +719 -0
data/test/support/test_encoding.html +12 -0
data/test/support/test_encoding2.html +12 -0
data/test/support/test_html.html +16 -0
data/test/support/test_xml.xml +5 -0
data/test/test_helper.rb +14 -0
metadata +126 -3

data/lib/docparser/output/nil_output.rb ADDED

@@ -0,0 +1,21 @@
+module DocParser
+  # This Output is used for testing purposes.
+  # @see Output
+  class NilOutput < Output
+    # @!visibility private
+    def initialize
+      @rowcount = 0
+    end
+    def close
+    end
+    def write_row(row)
+    end
+    def add_row(row)
+    end
+  end
+end

data/lib/docparser/output/screen_output.rb CHANGED

@@ -1,7 +1,7 @@
 require 'terminal-table'
 require 'pageme'
 module DocParser
-  # This Output is can be used for debugging purposes.
+  # This Output can be used for debugging purposes.
   # It pipes all rows through a pager
   # @see Output
@@ -25,6 +25,7 @@ module DocParser
     end
     def write_row(row)
+      raise MissingHeaderException if @header.nil? || @header.length == 0
       out = []
       0.upto(@header.length - 1) do |counter|
         out << [@header[counter], row[counter]]

data/lib/docparser/output/xlsx_output.rb CHANGED

@@ -20,9 +20,19 @@ module DocParser
       @sheet.add_row row
     end
-    def close
-      @sheet.add_table "A1:#{@sheet.cells.last.r}", name: 'Data'
+    def footer
+      unless @header.nil?
+        @sheet.add_table "A1:#{@sheet.cells.last.r}", name: 'Data'
+      end
       @package.serialize @filename
     end
+    def rowcount
+      if @header.nil?
+        @sheet.rows.length
+      else
+        @sheet.rows.length - 1
+      end
+    end
   end
 end

data/lib/docparser/output/yaml_output.rb CHANGED

@@ -6,9 +6,14 @@ module DocParser
   class YAMLOutput < Output
     # @!visibility private
     def write_row(row)
+      raise MissingHeaderException if @header.nil? || @header.length == 0
       @doc ||= {}
       0.upto(@header.length - 1) do |counter|
-        @doc[@header[counter]] = row[counter] rescue ''
+        if row.length > counter
+          @doc[@header[counter]] = row[counter]
+        else
+          @doc[@header[counter]] = ''
+        end
       end
       YAML.dump @doc, @file
     end

data/lib/docparser/parser.rb CHANGED

@@ -1,4 +1,4 @@
-$:.unshift __dir__
+$LOAD_PATH.unshift __dir__
 require 'rubygems'
 require 'bundler/setup'
 require 'version'
@@ -8,6 +8,8 @@ require 'nokogiri'
 require 'open-uri'
 require 'parallel'
 require 'set'
+require 'log4r'
+require 'log4r/formatter/patternformatter'
 require 'output/screen_output.rb'
 require 'output/csv_output.rb'
 require 'output/html_output.rb'
@@ -15,14 +17,26 @@ require 'output/xlsx_output.rb'
 require 'output/yaml_output.rb'
 require 'output/json_output.rb'
 require 'output/multi_output.rb'
-# {include:file:README.md}
+require 'output/nil_output.rb'
+Log4r.define_levels(*Log4r::Log4rConfig::LogLevels)
+logger = Log4r::Logger.new('docparser')
+output = Log4r::StdoutOutputter.new('docparser')
+output.formatter = Log4r::PatternFormatter.new(pattern: '[%l %C] %d :: %m')
+logger.outputters = output
+logger.level = Log4r::INFO
+logger = nil
+output = nil
+# The DocParser namespace
+# See README.md for information on using DocParser
 module DocParser
   # The main parser class. This is the class you'll use to create your parser
   # The real work happens in the Document class
   # @see Document
   class Parser
     # @!visibility private
-    attr_reader :outputs
+    attr_reader :outputs, :files, :num_processes, :encoding
     # Creates a new parser instance
     # @param files [Array] An array containing URLs or paths to files
@@ -33,73 +47,90 @@ module DocParser
     # @param range [Range] Range of files to process (nil means process all)
     # @param num_processes [Fixnum] Number of parallel processes
     def initialize(files: [], quiet: false, encoding: 'utf-8', parallel: true,
-                   output: ScreenOutput.new, range: nil,
+                   output: nil, range: nil,
                    num_processes: Parallel.processor_count + 1)
-      @quiet = quiet
-      @parallel = parallel
-      @num_processes = num_processes
+      @num_processes = parallel ? num_processes : 1
+      @files = range ? files[range] : files
       @encoding = encoding
-      if output.is_a? Output
-        @outputs = []
-        @outputs << output
-      elsif output.is_a?(Array) && output.all? { |o| o.is_a? Output }
-        @outputs = output
-      else
-        raise ArgumentError, 'No outputs specified'
-      end
-      @files = if range
-        files[range]
-      else
-        files
+      Log4r::Logger['docparser'].level = quiet ? Log4r::ERROR : Log4r::INFO
+      unless output.nil?
+        if output.is_a? Output
+          @outputs = []
+          @outputs << output
+        elsif output.is_a?(Array) && output.all? { |o| o.is_a? Output }
+          @outputs = output
+        else
+          raise ArgumentError, 'Invalid outputs specified'
+        end
+        @resultsets = Array.new(@outputs.length) { Set.new }
       end
-      log 'DocParser loaded..'
-      log "#{@files.length} files loaded (encoding: #{@encoding})"
+      @logger =  Log4r::Logger.new('docparser::parser')
+      @logger.info "DocParser v#{VERSION}"
+      @logger.info "#{@files.length} files loaded (encoding: #{@encoding})"
     end
     #
     # Parses the `files`
     #
     def parse!(&block)
-      log "Parsing #{@files.length} files."
+      @logger.info "Parsing #{@files.length} files."
       start_time = Time.now
-      resultsets = Array.new(@outputs.length) { Set.new }
-      if @parallel && @num_processes > 1
-        log "Starting #{@num_processes} processes"
-        Parallel.map(@files, in_processes: @num_processes) do |file|
-          Document.new(file, encoding: @encoding, parser: self).parse!(&block)
-        end.each do |result|
-          result.each_with_index { |set, index| resultsets[index].merge(set) }
-        end
-        log 'Parallel processing finished, writing results..'
+      if @num_processes > 1
+        parallel_process(&block)
       else
-        @files.each do |file|
-          doc = Document.new(file, encoding: @encoding, parser: self)
-          doc.parse!(&block).each_with_index do |set, index|
-            resultsets[index].merge(set)
-          end
+        serial_process(&block)
+      end
+      @logger.info 'Processing finished'
+      write_to_outputs if @outputs
+      @logger.info sprintf('Done processing in %.2fs.', Time.now - start_time)
+    end
+    private
+    def parallel_process(&block)
+      @logger.info "Starting #{@num_processes} processes"
+      Parallel.map(@files, in_processes: @num_processes) do |file|
+        # :nocov: #
+        parse_doc(file, &block)
+        # :nocov: #
+      end.each do |result|
+        result.each_with_index do |set, index|
+          @resultsets[index].merge(set)
+        end if @outputs
+      end
+    end
+    def serial_process(&block)
+      @files.each do |file|
+        parse_doc(file, &block).each_with_index do |set, index|
+          @resultsets[index].merge(set) if @outputs
         end
       end
+    end
-      log "\nSummary\n======="
+    def parse_doc(file, &block)
+      doc = Document.new(filename: file, encoding: @encoding, parser: self)
+      doc.parse!(&block)
+    end
+    def write_to_outputs
+      @logger.info 'Writing data..'
       @outputs.each_with_index do |output, index|
-        resultsets[index].each do |row|
+        @resultsets[index].each do |row|
           output.add_row row
         end
-        resultsets[index] = nil
+        @resultsets[index] = nil
         output.close
-        log output.summary
       end
-      log ''
-      log 'Done processing in %.2fs.' % (Time.now - start_time)
     end
-    private
-    def log(str)
-      puts str unless @quiet
-    end
   end
 end

data/lib/docparser/version.rb CHANGED

@@ -1,4 +1,4 @@
 module DocParser
   # The current version of DocParser
-  VERSION = '0.0.1'
+  VERSION = '0.1.0'
 end

data/test/lib/docparser/blackbox_test.rb ADDED

@@ -0,0 +1,29 @@
+require_relative '../../test_helper'
+require 'open3'
+require 'shellwords'
+def cmd_to_sys(command)
+  Open3.popen3(command) do |stdin, stdout, stderr|
+    [stdout.read, stderr.read]
+  end
+end
+describe DocParser do
+  it 'should run the example without problems' do
+    curwd = Dir.getwd
+    Dir.mktmpdir do |dir|
+      Dir.chdir(dir)
+      example_file = Shellwords.escape(File.join($ROOT_DIR, 'example.rb'))
+      out, err = cmd_to_sys '/usr/bin/env ruby ' + example_file
+      err.must_be_empty
+      rows = out.scan(/(\d+) rows/).flatten
+      rows.length.must_equal 5
+      row_lengths = rows.group_by { |elem| elem.to_i }
+      row_lengths.length.must_equal 1
+      # HaD: 40 pages of 7 articles
+      row_lengths.keys.first.must_equal(7 * 40)
+      out.must_match(/Done processing/)
+    end
+    Dir.chdir(curwd)
+  end
+end

data/test/lib/docparser/document_test.rb ADDED

@@ -0,0 +1,134 @@
+require_relative '../../test_helper'
+describe DocParser::Document do
+  before do
+    Log4r::Logger['docparser'].level = Log4r::INFO
+    $output = DocParser::NilOutput.new
+    @parser = Class.new do
+      define_method(:outputs) { [$output] }
+    end.new
+    @test_doc_path = File.join($SUPPORT_DIR, 'test_html.html')
+    @test_doc = DocParser::Document.new(filename: @test_doc_path,
+                                        parser: @parser)
+  end
+  it 'should read HTML contents' do
+    file = File.join($SUPPORT_DIR, 'test_html.html')
+    doc = DocParser::Document.new(filename: file, parser: @parser)
+    doc.doc.must_be_instance_of Nokogiri::HTML::Document
+    doc.html.must_equal(open(file).read)
+  end
+  it 'should read XML contents' do
+    file = File.join($SUPPORT_DIR, 'test_xml.xml')
+    doc = DocParser::Document.new(filename: file, parser: @parser)
+    doc.doc.must_be_instance_of Nokogiri::XML::Document
+    doc.html.must_equal(open(file).read)
+    doc.xpath_content('xmltest > title').must_equal('Test XML')
+    doc.xpath_content('xmltest > test').must_equal('Character Data')
+  end
+  it 'should read remote contents' do
+    url = 'https://gist.github.com/jurriaan/3f2750aa546e3e6719cf/raw'
+    doc = DocParser::Document.new(filename: url, parser: @parser)
+    doc.html.must_equal(open(url).read)
+  end
+  it 'should use the correct encoding' do
+    file = File.join($SUPPORT_DIR, 'test_encoding.html')
+    file2 = File.join($SUPPORT_DIR, 'test_encoding2.html')
+    doc = DocParser::Document.new(filename: file, parser: @parser)
+    doc2 = DocParser::Document.new(filename: file2,
+                                   parser: @parser,
+                                   encoding: 'iso-8859-1')
+    doc.html.must_equal(doc2.html)
+    doc.css_content('#encoding').must_equal(doc2.css_content('#encoding'))
+  end
+  it 'should specify filename and encoding in #inspect' do
+    @test_doc.inspect.must_include(@test_doc.filename)
+    @test_doc.inspect.must_include(@test_doc.encoding)
+  end
+  it 'should get the title of a document' do
+    @test_doc.title.must_equal('Test HTML')
+  end
+  it 'should store the path to the document' do
+    @test_doc.filename.must_equal(@test_doc_path)
+  end
+  it 'should be possible to use css queries' do
+    css = 'article > h1 + p'
+    css_content = @test_doc.css_content(css)
+    css_element = @test_doc.css(css)
+    css_content.must_equal('Great article it is')
+    css_content.must_equal(css_element.first.content)
+  end
+  it 'should be possible to use xpath queries' do
+    xpath = '//li/ancestor::article/h1'
+    xpath_content = @test_doc.xpath_content(xpath)
+    xpath_element = @test_doc.xpath(xpath)
+    xpath_content.must_equal('This is an article')
+    xpath_content.must_equal(xpath_element.first.content)
+  end
+  it 'should be possible to use regular expressions' do
+    regex = @test_doc.regexp(/\<h1\>([^\<])*/)
+    regex.must_equal(@test_doc.html.match(/\<h1\>([^\<])*/))
+  end
+  it 'should be possible to use blocks on query methods' do
+    array = []
+    @test_doc.css('p') do |element|
+      array << element.content
+    end
+    array.last.must_equal('This is the last paragraph')
+    array2 = []
+    @test_doc.xpath('//p') do |element|
+      array2 << element.content
+    end
+    array2.must_equal(array)
+  end
+  it 'should warn when providing an empty file' do
+    file = Tempfile.new('empty')
+    file.write('')
+    file.close
+    open(file.path).read.empty?.must_equal true
+    _, err = capture_io do
+      # Switch to hijacked IO
+      Log4r::Outputter['docparser'].instance_variable_set(:@out, $stderr)
+      DocParser::Document.new(filename: file.path, parser: @parser)
+    end
+    # Restore IO
+    Log4r::Outputter['docparser'].instance_variable_set(:@out, $stderr)
+    err.must_include "#{file.path} is empty"
+  end
+  it 'should add the row to the results' do
+    @test_doc.add_row ['test']
+    @test_doc.add_row 'test', 'test2'
+    @test_doc.results.must_equal [[['test'], ['test', 'test2']]]
+  end
+  it 'should be possible to specify outputs directly' do
+    @test_doc.add_row ['test!'], output: $output
+    @test_doc.results.must_equal [[['test!']]]
+  end
+  it 'should be possible to use multiple outputs' do
+    output = DocParser::NilOutput.new
+    output2 = DocParser::NilOutput.new
+    parser = Class.new do
+      define_method(:outputs) { [output, output2] }
+    end.new
+    test_doc = DocParser::Document.new(filename: @test_doc_path,
+                                      parser: parser)
+    test_doc.add_row ['a'], output: 1
+    test_doc.add_row ['b'], output: 0
+    test_doc.results.must_equal [[['b']], [['a']]]
+  end
+end

data/test/lib/docparser/logging_test.rb ADDED

@@ -0,0 +1,19 @@
+require_relative '../../test_helper'
+describe DocParser do
+  it 'should have a valid logger' do
+    logger = Log4r::Logger['docparser']
+    logger.wont_be_nil
+    logger.must_be_instance_of Log4r::Logger
+  end
+  it 'must have the correct loglevel by default' do
+    Log4r::Logger['docparser'].level.must_equal Log4r::INFO
+  end
+  it 'should log to the correct output' do
+    outputters = Log4r::Logger['docparser'].outputters
+    outputters.length.must_equal 1
+    outputters.first.must_be_instance_of Log4r::StdoutOutputter
+  end
+end