RubyGems - style-scanner - Versions diffs - 0.0.3 - Mend

style-scanner 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65) hide show

data/.gitignore +19 -0
data/.rspec +0 -0
data/.rvmrc +1 -0
data/Gemfile +4 -0
data/Rakefile +16 -0
data/bin/style +11 -0
data/lib/dictionaries/acronyms.txt +5 -0
data/lib/dictionaries/cliches.txt +680 -0
data/lib/dictionaries/nationalities.txt +185 -0
data/lib/style_scanner/problems/base.rb +45 -0
data/lib/style_scanner/scanner.rb +67 -0
data/lib/style_scanner/sentence.rb +61 -0
data/lib/style_scanner/sentence_scans/adverb.rb +19 -0
data/lib/style_scanner/sentence_scans/base.rb +70 -0
data/lib/style_scanner/sentence_scans/broken_link.rb +27 -0
data/lib/style_scanner/sentence_scans/capitalization.rb +57 -0
data/lib/style_scanner/sentence_scans/cliche.rb +21 -0
data/lib/style_scanner/sentence_scans/consecutively_repeated_word.rb +22 -0
data/lib/style_scanner/sentence_scans/excess_white_space.rb +22 -0
data/lib/style_scanner/sentence_scans/inappropriate_contraction.rb +20 -0
data/lib/style_scanner/sentence_scans/latin_abbreviation.rb +38 -0
data/lib/style_scanner/sentence_scans/passive_tense.rb +32 -0
data/lib/style_scanner/sentence_scans/speaking_in_generalities.rb +17 -0
data/lib/style_scanner/sentence_scans/spelling.rb +22 -0
data/lib/style_scanner/sentence_scans/ugly_word.rb +17 -0
data/lib/style_scanner/sentence_scans/used_word_already_in_sentence.rb +29 -0
data/lib/style_scanner/sentence_scans/useless_word.rb +17 -0
data/lib/style_scanner/string.rb +33 -0
data/lib/style_scanner/tagged_word.rb +58 -0
data/lib/style_scanner/tagger.rb +25 -0
data/lib/style_scanner/version.rb +3 -0
data/lib/style_scanner.rb +17 -0
data/readme.textile +157 -0
data/spec/fixtures/sample_text.txt +2 -0
data/spec/fixtures/stylish/economist/economist-1.txt +29 -0
data/spec/fixtures/stylish/economist/economist-2.txt +21 -0
data/spec/fixtures/stylish/economist/economist-3.txt +9 -0
data/spec/fixtures/stylish/economist/economist-4.txt +23 -0
data/spec/fixtures/stylish/economist/economist-5.txt +15 -0
data/spec/fixtures/stylish/economist/economist-6.txt +37 -0
data/spec/integrations/command_line_spec.rb +41 -0
data/spec/problems/base_spec.rb +38 -0
data/spec/scanner_spec.rb +41 -0
data/spec/sentence_scans/adverb_spec.rb +13 -0
data/spec/sentence_scans/base_spec.rb +18 -0
data/spec/sentence_scans/broken_link_spec.rb +18 -0
data/spec/sentence_scans/capitalization_spec.rb +44 -0
data/spec/sentence_scans/cliche_spec.rb +35 -0
data/spec/sentence_scans/consecutively_repeated_word_spec.rb +26 -0
data/spec/sentence_scans/excess_white_space_spec.rb +22 -0
data/spec/sentence_scans/inappropriate_contraction_spec.rb +21 -0
data/spec/sentence_scans/latin_abbreviation_spec.rb +34 -0
data/spec/sentence_scans/passive_tense_spec.rb +138 -0
data/spec/sentence_scans/speaking_in_generalities_spec.rb +15 -0
data/spec/sentence_scans/spelling_spec.rb +16 -0
data/spec/sentence_scans/ugly_word_spec.rb +29 -0
data/spec/sentence_scans/used_word_already_in_sentence.rb +21 -0
data/spec/sentence_scans/useless_word_spec.rb +14 -0
data/spec/sentence_spec.rb +76 -0
data/spec/spec_helper.rb +26 -0
data/spec/string_spec.rb +30 -0
data/spec/tagged_word_spec.rb +35 -0
data/spec/tagger_spec.rb +14 -0
data/style-scanner.gemspec +30 -0
metadata +263 -0

data/lib/dictionaries/nationalities.txt ADDED Viewed

@@ -0,0 +1,185 @@
+afghans
+albanians
+algerians
+americans
+andorrans
+angolans
+argentines
+armenians
+aromanians
+arubans
+australians
+albanians
+algerians
+andorrans
+angolans
+argentines
+armenians
+aromanians
+arubans
+australians
+austrians
+azeris
+bahamians
+bahrainis
+bangladeshis
+barbadians
+belarusians
+belgians
+belizeans
+bermudians
+boers
+bosnians
+brazilians
+bretons
+britons
+british virgin islanders
+bulgarians
+burkinabès
+burundians
+cambodians
+cameroonians
+canadians
+catalans
+cape verdeans
+chadians
+chileans
+colombians
+comorians
+congolese
+croatians
+cubans
+cypriots
+turkish cypriots
+czechs
+danes
+dominicans
+dominicans
+dutch
+east timorese
+ecuadorians
+egyptians
+emiratis
+english
+eritreans
+estonians
+ethiopians
+finns
+finnish swedish
+fijians
+filipinos
+french citizens
+georgians
+germans
+baltic germans
+ghanaians
+gibraltar
+greeks
+grenadians
+guatemalans
+guianese
+guineans
+guinea-bissau nationals
+guyanese
+haitians
+hondurans
+hong kongers
+hungarians
+icelanders
+indians
+indonesians
+iranians
+iraqis
+irish
+israelis
+italians
+ivoirians
+jamaicans
+japanese
+jordanians
+kazakhs
+kenyans
+koreans
+kosovo albanians
+kuwaitis
+lao
+latvians
+lebanese
+liberians
+libyans
+liechtensteiners
+lithuanians
+luxembourgers
+macedonians
+malawians
+malaysians
+maldivians
+malians
+maltese
+manx
+mauritians
+mexicans
+moldovans
+moroccans
+mongolians
+montenegrins
+mozambicans
+namibians
+nepalese
+new zealanders
+nicaraguans
+nigeriens
+nigerians
+norwegians
+pakistanis
+palauans
+palestinians
+panamanians
+papua new guineans
+paraguayans
+peruvians
+poles
+portuguese
+puerto ricans
+quebecers
+réunionnais
+romanians
+russians
+baltic russians
+rwandans
+salvadorans
+são tomé and príncipe
+saudis
+scots
+senegalese
+serbs
+sierra leoneans
+sikhs
+singaporeans
+slovaks
+slovenes
+somalis
+south africans
+spaniards
+sri lankans
+sudanese
+swedes
+swiss
+syrians
+taiwanese
+tanzanians
+thais
+tibetans
+tobagonians
+trinidadians
+turks
+tuvaluans
+ugandans
+ukrainians
+uruguayans
+venezuelans
+vietnamese
+welsh
+yemenis
+zambians
+zimbabweans

data/lib/style_scanner/problems/base.rb ADDED Viewed

@@ -0,0 +1,45 @@
+# coding: utf-8
+module StyleScanner
+  module Problems
+    class Base
+      attr_reader :offending_text, :sentence
+      def initialize(sentence, offending_text)
+        @sentence = sentence
+        @offending_text = offending_text
+      end
+      def on_text?(problematic_word)
+        offending_text.strip_punctuation == problematic_word.strip_punctuation
+      end
+      def user_friendly_readout
+        [problem_name.red,sentence.text.green,offending_text.yellow].join(" | ")
+      end
+      private
+      def problem_name
+        unformatted_name = self.class.to_s.gsub(/Style::Problems::/,"").titlecase
+        return "Cliché" if unformatted_name == "Cliche"
+        unformatted_name
+      end
+    end
+    def self.problem_class_names_from_dir
+      Dir[(File.dirname(__FILE__) + "/../sentence_scans/*.rb")].
+        map {|filename| File.basename(filename, ".rb").split("_").map(&:capitalize).join } - ["Base"]
+    end
+    def self.dynamically_generate_problem_classes
+      problem_class_names_from_dir.each do |problem_class_name|
+        eval %Q{ class #{problem_class_name} < Base
+                 end}
+      end
+    end
+    dynamically_generate_problem_classes
+  end
+end

data/lib/style_scanner/scanner.rb ADDED Viewed

@@ -0,0 +1,67 @@
+module StyleScanner
+  class Scanner
+    attr_reader :input_text, :sentences, :options
+    attr_accessor :finished_text
+    def initialize(input, options={})
+      # remove html
+      @options = options
+      @input_text = convert_to_txt(input)
+      @sentences = split_into_sentences
+    end
+    def scan
+      sentences.each do |sentence|
+        desired_scans.each do |scanner_type|
+          scanner_type.scan(sentence)
+        end
+         puts sentence.user_friendly_readout
+      end
+    end
+    private
+    def convert_to_txt(input)
+      if options[:html]
+        remove_html(input)
+      elsif options[:textile]
+        textile_to_txt(input)
+      else
+        input
+      end
+    end
+    def textile_to_txt(input)
+      remove_html(RedCloth.new(input).to_html)
+    end
+    def remove_html(input)
+      Sanitize.clean(input)
+    end
+    def desired_scans
+      desired_optional_scans + default_scans
+    end
+    def desired_optional_scans
+      result = []
+      result << SentenceScans::Adverb if options[:adverb]
+      result << SentenceScans::Spelling if options[:spellcheck]
+      result
+    end
+    def split_into_sentences
+      tokenizer = Punkt::SentenceTokenizer.new(input_text)
+      tokenizer.sentences_from_text(input_text, :output => :sentences_text).map {|text| Sentence.new(text)}
+    end
+    def default_scans
+      [SentenceScans::UselessWord, SentenceScans::UglyWord, SentenceScans::ConsecutivelyRepeatedWord,
+      SentenceScans::ExcessWhiteSpace, SentenceScans::BrokenLink, SentenceScans::UsedWordAlreadyInSentence,
+      SentenceScans::SpeakingInGeneralities, SentenceScans::Cliche, SentenceScans::PassiveTense, SentenceScans::Capitalization]
+    end
+  end
+end

data/lib/style_scanner/sentence.rb ADDED Viewed

@@ -0,0 +1,61 @@
+module StyleScanner
+  class Sentence
+    extend Forwardable
+    def_delegators :copy_of_text, :gsub!, :match, :scan, :downcase, :sub!
+    attr_reader :problems, :text
+    def initialize(text)
+      @text = text
+      @problems = []
+    end
+    def find_problems_by_type(problem_type)
+      @problems.select {|problem| problem.class == problem_type}
+    end
+    def tagged_words
+      @tagged_words ||= Tagger.new(text).tagged_words
+    end
+    def adverbs
+      part_of_speech("RB")
+    end
+    def contains?(word, options = {})
+      options = {:strip_case=> true}.merge(options)
+      text_to_scan = text
+      text_to_scan = text_to_scan.downcase if options[:strip_case]
+      text_to_scan = text_to_scan.stem_verbs if options[:stem_verbs]
+      text_to_scan.match /\b#{word}\b/
+    end
+    def user_friendly_readout
+       problems.flatten.map(&:user_friendly_readout) if with_problems?
+    end
+    def add_problem(problem)
+      problems << problem
+    end
+    def with_problems?
+      problems.any?
+    end
+    def to_s
+      "Sentence Obj: text: #{text} problems: #{problems}"
+    end
+    private
+    def part_of_speech(pos)
+      tagged_words.select {|tagged_word| tagged_word.tag == pos }.map(&:word)
+    end
+    # we don't want to modify the original text
+    def copy_of_text
+      text.dup
+    end
+  end
+end

data/lib/style_scanner/sentence_scans/adverb.rb ADDED Viewed

@@ -0,0 +1,19 @@
+module StyleScanner
+  module SentenceScans
+    class Adverb < Base
+      def scan
+        adverbs.each do |adverb|
+          create_problem(adverb)
+        end
+      end
+      private
+      def adverbs
+        sentence.adverbs
+      end
+    end
+  end
+end

data/lib/style_scanner/sentence_scans/base.rb ADDED Viewed

@@ -0,0 +1,70 @@
+module StyleScanner
+  module SentenceScans
+    class Base
+      attr_reader :sentence
+      def initialize(sentence)
+        @sentence = sentence
+      end
+      def self.scan(sentence)
+        new(sentence).scan
+      end
+      private
+      def word_pairs
+        # ruby searches for WORD_PAIRS on base class without the following line
+        word_pairs = self.class::WORD_PAIRS
+      end
+      def replacement_word(offending_word)
+        word_pairs[offending_word]
+      end
+      def tokenized_words
+        words.map(&:tokenized).reject {|word| word == ""}
+      end
+      # We retokenize for the text case where no overall scanner is prepared
+      def words
+        sentence.tagged_words
+      end
+      def next_word(word)
+        words.at(words.index(word) + 1)
+      end
+      def next_significant_word(word)
+        possible_word = next_word(word)
+        return next_significant_word(possible_word) if possible_word.adverb? || possible_word.preposition? || possible_word.determiner?
+        possible_word
+      end
+      def already_has_that_problem_on_text(offending_text)
+        sentence.find_problems_by_type(problem_class).any? do |problem|
+          problem.on_text?(offending_text)
+        end
+      end
+      def problem_class
+        Problems.const_get(self.class.to_s.gsub("StyleScanner::SentenceScans::", ""))
+      end
+      def create_problem(offending_text)
+        sentence.add_problem(problem_class.new(sentence, offending_text)) unless already_has_that_problem_on_text(offending_text)
+      end
+      class << self
+         def load_file(filename)
+           file_location = File.expand_path("../../../dictionaries/#{filename}", __FILE__)
+           IO.read(file_location).split("\n")
+         end
+      end
+    end
+  end
+end

data/lib/style_scanner/sentence_scans/broken_link.rb ADDED Viewed

@@ -0,0 +1,27 @@
+module StyleScanner
+  module SentenceScans
+    class BrokenLink < Base
+      URL_REGEX = /(?#Protocol)(?:(?:ht|f)tp(?:s?)\:\/\/|~\/|\/)?(?#Username:Password)(?:\w+:\w+@)?(?#Subdomains)(?:(?:[-\w]+\.)+(?#TopLevel Domains)(?:com|org|net|gov|mil|biz|info|mobi|name|aero|jobs|museum|travel|[a-z]{2}))(?#Port)(?::[\d]{1,5})?(?#Directories)(?:(?:(?:\/(?:[-\w~!$+|.,=]|%[a-f\d]{2})+)+|\/)+|\?|#)?(?#Query)(?:(?:\?(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)(?:&(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)*)*(?#Anchor)(?:#(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)?/
+        def scan
+          links = sentence.scan(URL_REGEX)
+          links.each do |url|
+            begin
+              attempt_to_visit_url(url)
+              # socket error occurs if link is bad
+            rescue SocketError, Errno::ECONNREFUSED
+              create_problem("Url #{url} does not work")
+            end
+          end
+        end
+      private
+      def attempt_to_visit_url(url)
+        Net::HTTP.get_response(URI.parse(url))
+      end
+    end
+  end
+end

data/lib/style_scanner/sentence_scans/capitalization.rb ADDED Viewed

@@ -0,0 +1,57 @@
+module StyleScanner
+  module SentenceScans
+    class Capitalization < Base
+      ACRONYMS = load_file("acronyms.txt")
+      NATIONALITIES = load_file("nationalities.txt")
+      MONTHS = %w(
+     january february march april may june july august september october november december
+      )
+      DAYS = %w(
+        monday tuesday wednesday thursday friday saturday sunday
+      )
+      SEASONS = %w(
+       Winter
+       Summer
+       Spring
+       Autumn
+      )
+      def scan
+        flag_lowercase(MONTHS)
+        flag_lowercase(ACRONYMS)
+        flag_lowercase(DAYS)
+        flag_lowercase(NATIONALITIES)
+        flag_uppercase(SEASONS)
+        create_problem(first_letter) if first_letter_is_lowercase?
+      end
+      private
+      def first_word
+        words.first
+      end
+      def first_letter
+        first_word.word.chars.first
+      end
+      def first_letter_is_lowercase?
+        first_letter != first_letter.upcase
+      end
+      def flag_uppercase(collection)
+        collection.each do |word|
+          create_problem(word.downcase) if sentence.contains?(word, :strip_case => false)
+        end
+      end
+      def flag_lowercase(collection)
+        collection.each do |word|
+          create_problem(word.upcase) if sentence.contains?(word, :strip_case => false)
+        end
+      end
+    end
+  end
+end

data/lib/style_scanner/sentence_scans/cliche.rb ADDED Viewed

@@ -0,0 +1,21 @@
+module StyleScanner
+  module SentenceScans
+    class Cliche < Base
+      CLICHES = load_file("cliches.txt")
+      def scan
+        Cliche.stemmed_cliches.each.with_index do |cliche, index|
+          create_problem(CLICHES[index]) if sentence.contains?(cliche, :stem_verbs => true)
+        end
+      end
+      def self.stemmed_cliches
+        @@stemmed_cliches ||= CLICHES.map do |cliche|
+          cliche.stem_verbs
+        end
+      end
+    end
+  end
+end

data/lib/style_scanner/sentence_scans/consecutively_repeated_word.rb ADDED Viewed

@@ -0,0 +1,22 @@
+module StyleScanner
+  module SentenceScans
+    class ConsecutivelyRepeatedWord < Base
+      REPEATED_WORD_REGEX = /\b(\w+)\b\s+\b\1\b/
+      def scan
+        consecutively_repeated_words.each do |repeated_word|
+          create_problem("#{repeated_word} #{repeated_word}")
+        end
+      end
+      private
+      def consecutively_repeated_words
+        sentence.downcase.scan(REPEATED_WORD_REGEX).flatten
+      end
+    end
+  end
+end

data/lib/style_scanner/sentence_scans/excess_white_space.rb ADDED Viewed

@@ -0,0 +1,22 @@
+module StyleScanner
+  module SentenceScans
+    class ExcessWhiteSpace < Base
+      def scan
+        white_space_problems.each do |problem|
+          create_problem(problem.post_match)
+        end
+      end
+      private
+      def white_space_problems
+        between_words = sentence.match /\s{2,}/
+        before_full_stop = sentence.match /\s{1,}\./
+        before_commas = sentence.match /\s{1,}\,/
+        [between_words, before_full_stop, before_commas].compact.flatten
+      end
+    end
+  end
+end

data/lib/style_scanner/sentence_scans/inappropriate_contraction.rb ADDED Viewed

@@ -0,0 +1,20 @@
+module StyleScanner
+  module SentenceScans
+    class InappropriateContraction < Base
+      WORD_PAIRS = {"don't" => "do not", "can't" => 'cannot',
+        "won't" => "will not", "shan't" => "shall not",
+        "hasn't" => "has not", "i'm" => "I am", "he'll" => "he will",
+        "she'll" => "she will", "didn't" => "did not",
+        "shouldn't" => "should not", "could've" => "could have",
+        "they'll" => "they will", "we'll" => "we will"}
+      def scan
+        WORD_PAIRS.keys.each do |offender|
+          create_problem(replacement_word(offender)) if sentence.contains?(offender)
+        end
+      end
+    end
+  end
+end

data/lib/style_scanner/sentence_scans/latin_abbreviation.rb ADDED Viewed

@@ -0,0 +1,38 @@
+module StyleScanner
+  module SentenceScans
+    class LatinAbbreviation < Base
+      LATINS = ["ie", "etc", "cf", "et cetera", "ergo",
+        "c", "cf", "ibid", "dto", "et al", "et seq", "vs",
+        "re", "nb"]
+      def scan
+        dot_placement_permutations(LATINS).each do |latin_abbreviation|
+          if sentence.contains?(latin_abbreviation)
+            create_problem(latin_abbreviation)
+          end
+        end
+      end
+      private
+      def dot_placement_permutations(abbreviations)
+        abbreviations.map do |abbr|
+          [abbr, dot_at_end(abbr), dot_between_every_letter(abbr)]
+        end.flatten
+      end
+      def dot_at_end(abbr)
+        "#{abbr}."
+      end
+      # this method overshoots in permuations for some latins, but it
+      # shouldn't matter
+      def dot_between_every_letter(abbr)
+        dot_between_all_but_last = abbr.split("").join(".")
+        [dot_between_all_but_last, dot_at_end(dot_between_all_but_last)]
+      end
+    end
+  end
+end

data/lib/style_scanner/sentence_scans/passive_tense.rb ADDED Viewed

@@ -0,0 +1,32 @@
+module StyleScanner
+  module SentenceScans
+    class PassiveTense < Base
+      # heuristic: A "BE" verb follwed by a verb other than a gerund
+      def scan
+        passives.each do |passive|
+          create_problem(word_in_context(passive))
+        end
+      end
+      private
+      def passives
+        words.find_all do |word|
+          word.be_verb? && (! next_word(word).gerund_verb? ) && (! state_word?(word))
+        end
+      end
+      def word_in_context(main_word)
+        position_of_main_word = words.index(main_word)
+        words[position_of_main_word-1, 3].map(&:word).join(" ")
+      end
+      def state_word?(word)
+        word = next_significant_word(word)
+        word.noun? || word.possessive?
+      end
+    end
+  end
+end