extractula 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,12 +3,13 @@ $LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirna
3
3
  module Extractula; end
4
4
 
5
5
  require 'nokogiri'
6
+ require 'loofah'
6
7
  require 'domainatrix'
7
8
  require 'extractula/extracted_content'
8
9
  require 'extractula/extractor'
9
10
 
10
11
  module Extractula
11
- VERSION = "0.0.4"
12
+ VERSION = "0.0.5"
12
13
 
13
14
  @extractors = []
14
15
 
@@ -7,14 +7,13 @@ class Extractula::ExtractedContent
7
7
 
8
8
  def summary
9
9
  return @summary if @summary
10
- @content_doc ||= Nokogiri::HTML(@content)
11
- content_fragment = @content_doc.inner_text.slice(0, 350)
10
+ content_fragment = Loofah.scrub_document(@content, :prune).text.gsub("\\n", " ").gsub(/\s+/, " ").slice(0, 350).strip
12
11
  sentence_break = content_fragment.rindex(/\?|\.|\!|\;/)
13
12
  if sentence_break
14
13
  @summary = content_fragment.slice(0, sentence_break + 1)
15
14
  @summary
16
15
  else
17
- @summary = content_fragment
16
+ @summary = content_fragment.gsub(/\s\w+$/, "...")
18
17
  end
19
18
  end
20
19
 
@@ -23,6 +23,10 @@ describe "extracted content" do
23
23
  extracted = Extractula::ExtractedContent.new(:content => "<p>I've been quietly working on Typhoeus for the last few months. With the help of <a href=\"http://metaclass.org/\">Wilson Bilkovich</a> and <a href=\"http://github.com/dbalatero\">David Balatero</a> I've finished what I think is a significant improvement to the library. The new interface removes all the magic and opts instead for clarity.</p>\n<p>It's really slick and includes improved stubing support, caching, memoization, and (of course) parallelism. The <a href=\"http://github.com/pauldix/typhoeus/\">Typhoeus readme</a> highlights all of the awesomeness. It should be noted that the old interface of including Typhoeus into classes and defining remote methods has been deprecated. I'll be removing that sometime in the next six months.</p>\n<p>In addition to thanking everyone using the library and everyone contributing, I should also thank my employer kgbweb. If you're a solid Rubyist that likes parsing, crawling, and stuff, or a machine learning guy, or a Solr/Lucene/indexing bad ass, let me know. We need you and we're doing some crazy awesome stuff.</p>")
24
24
  extracted.summary.should == "I've been quietly working on Typhoeus for the last few months. With the help of Wilson Bilkovich and David Balatero I've finished what I think is a significant improvement to the library. The new interface removes all the magic and opts instead for clarity."
25
25
  end
26
+
27
+ it "cleans script tags and their content" do
28
+ Extractula::ExtractedContent.new(:content => read_test_file("script_tag_remove_case.html")).summary.should == "Obama to meet with House Republicans By Perry Bacon Jr. Washington Post Staff Writer Tuesday, January 26, 2010; A13 President Obama will meet Friday with perhaps his harshest critics outside of Fox News headquarters: the House Republicans."
29
+ end
26
30
  end
27
31
 
28
32
  describe "image_urls" do
@@ -0,0 +1 @@
1
+ <div style="float:right;padding-left:17px;">\n<script>\n<!--\nif ( show_doubleclick_ad && ( adTemplate & BIGBOX_FLEX ) == BIGBOX_FLEX )\n{\ndocument.write('<div style="margin-top:4px; margin-bottom:4px;clear:left;">') ;\ndocument.writeln ('<div><img src="http://media3.washingtonpost.com/wp-srv/hp/img/ad_label_leftjust.gif" alt="ad_icon" width="100" height="13" border="0"/></div>' );\n}\n// -->\n</script><script>\nif ( show_doubleclick_ad && ( adTemplate & BIGBOX_FLEX ) == BIGBOX_FLEX )\n{\nplaceAd('ARTICLE',commercialNode,5,'',true) ;\n}\n</script><script language="javascript">\n<!--\nif ( show_doubleclick_ad && ( adTemplate & BIGBOX_FLEX ) == BIGBOX_FLEX )\n{\ndocument.write('</div>') ;\n}\n// -->\n</script>\n</div>\n<style>\n.correction {\nmargin-top:8px;\npadding-top:10px;\nmargin-bottom:8px;\nborder-bottom:1px solid #CCCCCC;\npadding-bottom:10px;\nfont-family:arial,sans-serif;\nfont-size:11px;\ncolor:#333333;\n}\n.correction strong {\ncolor:#CC0000;\ntext-transform:uppercase;\n}\n</style>\n<div style="margin-right:165px;">\n</div>\n<font size="+2"><b>Obama to meet with House Republicans</b></font><br><p>\n<font size="-1">\nBy Perry Bacon Jr.<br>\nWashington Post Staff Writer<br>\nTuesday, January 26, 2010;\nA13\n<br></font>\n</p>\n<p>\n</p>\n<p>\n<a href="http://www.whorunsgov.com/Profiles/Barack_Obama" target="">President Obama</a> will meet Friday with perhaps his harshest critics outside of Fox News headquarters: the House Republicans.\n</p>\n<p>\nThe House <a href="http://projects.washingtonpost.com/politicsglossary/party-affiliated/GOP/" target="">GOP</a> invited Obama this year to speak at its annual retreat, which will be held in Baltimore from Thursday to Saturday. Coming only two days after Obama's State of the Union address, the session could herald better relations between the two sides in 2010 -- or lift their tensions to an even higher level.\n</p>\n<p>\nThe White House and congressional Republicans spent much of last year bickering over whom to blame for their inability to work together, as the administration constantly blasted the House <a href="http://projects.washingtonpost.com/politicsglossary/party-affiliated/GOP/" target="">GOP</a> for unanimously opposing the economic stimulus, while Republicans said Obama and House Democrats refused to incorporate their ideas. A private meeting at the White House that included Obama and House Republicans in December on job growth turned into a griping session, with the president accusing <a href="http://projects.washingtonpost.com/politicsglossary/party-affiliated/Republican-Party/" target="">the GOP</a> of "scaring" Americans about his policies while Republicans said the anxiety in the country stemmed from his agenda.\n</p>\n<p>\nSo far this year, nothing has changed. House Republicans have said Obama's policies led to the defeat of Democrat Martha Coakley in the special Senate election in Massachusetts. White House advisers, in turn, have blamed <a href="http://projects.washingtonpost.com/politicsglossary/party-affiliated/Republican-Party/" target="">the GOP</a> for the negative tone of Washington politics.\n</p>\n<p>\nRep. <a href="http://www.whorunsgov.com/Profiles/Mike_Pence" target="">Mike Pence</a> (Ind.), the No. 3 in the House <a href="http://projects.washingtonpost.com/politicsglossary/party-affiliated/GOP/" target="">GOP</a> leadership and the organizer of the retreat, said House Republicans wanted a stronger relationship with Obama and said <a href="http://projects.washingtonpost.com/politicsglossary/party-affiliated/Republican-Party/" target="">the GOP</a>'s goals of working with Obama and winning this fall's elections are not in conflict. "We serve our party best when we serve our country," he said. But he added that "the conversation with the president has to be a two-way street."\n</p>\n<p>\nIn addition to Obama, the House GOP will hear from Virginia Gov. <a href="http://www.whorunsgov.com/Profiles/Robert_F._McDonnell" target="">Robert F. McDonnell</a>, one of the party's new stars, as well as former House speaker <a href="http://www.whorunsgov.com/Profiles/Newt_Gingrich" target="">Newt Gingrich</a> and former House <a href="http://projects.washingtonpost.com/politicsglossary/Congressional/majority-leader/" target="">majority leader</a> Richard K. Armey, who heads up the conservative activist group FreedomWorks. Party leaders said they will focus on discussing a policy agenda for their candidates in the <a href="http://projects.washingtonpost.com/politicsglossary/election/midterm-election/" target="">midterm elections</a>.\n</p>\n<p>\nLast year's retreat was at the Homestead in Hot Springs, Va. This year, worried about the appearance of a staying at a posh hotel as unemployment hovers over 10 percent, the Republicans have opted for a Marriott near the Inner Harbor. Earlier this month, Democrats eschewed holding a retreat at a luxury resort and heard from experts and the president in the Capitol's visitor center.\n</p>\n<b>'Maybe I'm a masochist'</b><br><p>\nWhile he deals with a energized GOP, Obama will also face an increasingly anxious left of his party in Congress. The Progressive Caucus, a group of more than 80 of the most liberal members in Congress, says Republican Scott Brown's upset victory in Massachusetts was not because Obama and Democrats were too liberal, but because they were insufficiently so.\n</p>\n<p>\n"I don't think it was about health care, it was because change didn't happen fast enough -- that's the frustration," said <a href="http://www.whorunsgov.com/Profiles/Lynn_Woolsey" target="">Rep. Lynn Woolsey</a> (D-Calif.), one of the group's leaders. "I believe that if we had pursued the populist, progressive agenda, such as a public option, we could have energized our base."\n</p>\n<p>\nA Washington Post-Kaiser-Harvard poll of Massachusetts voters conducted after Brown's election showed that young and minority voters, who formed the backbone of Obama's support in 2008, represented a smaller percentage of the electorate in last Tuesday's special election. It's not clear whether policy issues or Obama's absence from the ballot caused some of these voters not to go to the polls.\n</p>\n<p>\nWhatever the reason for the Massachusetts loss, Rep. Raul <a href="http://www.whorunsgov.com/Profiles/Raul_M._Grijalva" target="">Raul Grijalva</a> (D-Ariz.), leader of the Progressive Caucus, has outlined an agenda for 2010 that he says will appeal to the base: increased funding for education, a job-creation bill bigger than the $154 billion version that passed the House in December over the objections of many Democratic moderates, and immigration reform. The latter in particular is unlikely to pass this year.\n</p>\n<p>\n"We are going to push," he said. "Maybe I'm masochist, but I'm still optimistic."\n</p>\n<b>Self-evident truths?</b><br><p>\nThe tea party is coming to Capitol Hill. Hours before the president's speech on Wednesday, <a href="http://www.whorunsgov.com/Profiles/Michele_Bachmann" target="">Rep. Michele Bachmann</a> (R-Minn.), one of the lawmakers most closely allied with the movement, and FreedomWorks will hold an event with conservative activists and lawmakers to tout a "Declaration of Health Care Independence." An aide to Bachmann said the proposal would "protect the rights of the American to make their own health decisions," as well as include 10 conservative ideas for future health reform.\n</p>\n<p>\nThe health-care event is one of the first steps the tea-party movement will take this year as it seeks to expand its influence. At a news conference Monday, FreedomWorks put out a list of candidates it is backing or opposing in key races this year. Florida <a href="http://www.whorunsgov.com/Profiles/Charlie_Crist" target="">Gov. Charlie Crist</a> (R), a candidate for the Senate; <a href="http://www.whorunsgov.com/Profiles/Harry_M._Reid" target="">Sen. Harry Reid</a> (D-Nev.); and <a href="http://www.whorunsgov.com/Profiles/Alan_Grayson" target="">Rep. Alan Grayson</a> (D-Fla.) each are labeled an "Enemy of Liberty" whom the group will oppose. FreedomWorks will back GOP Senate candidates Marco Rubio (Fla.), <a href="http://www.whorunsgov.com/Profiles/Patrick_Toomey" target="">Pat Toomey</a> (Pa.) and Rand Paul (Ky.) -- each, according to the group, is a "Champion of Freedom."\n</p>\n<p>\n<i>In Session is a weekly look inside Congress.</i>\n</p>\n<p>\n</p>\n<p>\n</p>\n<script>\nvar comments_url = "http://www.washingtonpost.com/wp-dyn/content/article/2010/01/25/AR2010012503691_Comments.html" ;\nvar article_id = "AR2010012503691" ;\n</script><span class="display:none;" name="pubDate" id="pubDate" value="1264482000000"></span>\n<link href="http://www.washingtonpost.com/wp-srv/css/commentslinks.css" rel="stylesheet" media="all">\n<script>\n<!--\nvar COMMENTS_ALLOWED = false ;\nvar COMMENTS_ACTIVE = false ;\n\nvar comments_period = ( typeof wp_article != "undefined" && typeof wp_article.comments_period != "undefined" && ( wp_article.comments_period != "" || wp_article.comments_period == "0" ) && wp_article.comments_period >= 0 ) ? wp_article.comments_period : 3 ;\n\narticleCommentsUrl = document.location.href;\nif ( typeof comments_url == 'undefined' ) {\n\tvar article_pathname = document.location.pathname;\n\tvar comments_url = article_pathname.split(".")[0]+"_Comments."+article_pathname.split(".")[1];\n}\nif ( typeof article_id == 'undefined' ) {\n\tvar article_id = articleCommentsUrl.split("/").pop(); \n\tvar article_id = article_id.split(".")[0];\n\tarticle_id = article_id.replace(/(.*)_(\\d+|\\w+)/, function(match,submatch1,offset,string) { return submatch1; } ) ;\n}\n\nfunction checkDaysOld(daysOld) {\n\tvar todayString = 'January 27, 2010';\n\tvar today = new Date(todayString).getTime();\n\n\tvar daysOld = 86400 * daysOld * 1000;\n\tvar pubDate = document.getElementById("pubDate");\n\tif(pubDate != null) {\n\t\tpubDate = pubDate.getAttribute("value");\n\t\treturn (today - pubDate > daysOld )?true:false;\n\t} else {\n\t\treturn false;\n\t}\n}\n\nif ( typeof thisNode == 'undefined' )\n\tthisNode = 'admin' ;\ncmt_ancestor = thisNode.split('/')[0] ;\n\n// (black list) && (white list) of ancestors and sections goes here\nif ( \t!( thisNode.match(/\\/wires$/) || thisNode.match(/^artsandliving\\/(entertainmentguide|entertainmentnews|travel\\/index)($|\\/)/) || thisNode.match(/^business\\/(portfolio)($|\\/)/) || thisNode.match(/^metro\\/(obituaries)($|\\/)/) )\n\t\t\t&&\n\t\t( thisNode.match(/^(artsandliving|business|cars|education|health|jobs|media|metro|nation|realestate|religion|politics|sports|technology|world|kidspost|media)($|\\/)/) || thisNode.match(/^print\\/(washpostmagazine|style|sundayarts|sundaysource)($|\\/)/) || thisNode.match(/^opinions($|\\/)/) || thisNode.match(/^opinion\\/(columns)($|\\/)/) ))\n{\n\tCOMMENTS_ALLOWED = true ;\n\tif(!checkDaysOld(comments_period) ) {\n\t\tdocument.write("<style>#ArticleCommentsWrapper {display:block};</style>"); \n\t\tCOMMENTS_ACTIVE = true ;\n\t} else {\n\t\tdocument.write('<p class="posted"><a href="'+comments_url+'">View all comments</a> that have been posted about this article.</p>');\n\t}\n} else {\n\tdocument.write('');\n}\n// -->\n</script><script>\n<!--\nfunction getDisplayUserName()\n{\n\tvar output = 'Your washingtonpost.com User ID' ;\n\t//check to the see if the user is signed in\n\tif (document.cookie.indexOf("wpniuser") != -1)\n\t{\n\t\tvar start = (document.cookie.indexOf("wpniuser") + 9) ;\n\t\tvar end = (document.cookie.indexOf(";",start)) == -1 ? document.cookie.length : document.cookie.indexOf(";",start) ;\n\t\tcookieuser = document.cookie.substring(start,end) ;\n\n\t\t// show their username and registration links\n\t\tif ( cookieuser.indexOf("@") != -1)\n\t\t\tcookieuser = cookieuser.substring(0,cookieuser.indexOf("@")) ;\n\n\t\t// cookieuser = cookieuser.trim() ;\n\t\tif ( cookieuser )\n\t\t\toutput += (', '+cookieuser+',') ;\n\t}\n\toutput += ' will be displayed with your comment.' ;\n\treturn output ;\n}\n// -->\n</script><div id="ArticleCommentsWrapper">\n<div class="comments">\n<div class="hdr">\n<div style="float:left;"><p class="action">Post a Comment</p></div>\n<div style="float:right;"><script src="http://www.washingtonpost.com/wp-srv/ad/comments_box.js"></script></div>\n<div style="clear:both;"></div>\n<br><script>\ndocument.write('<p class="posted"><a href="'+comments_url+'">View all comments</a> that have been posted about this article.</p>');\n</script><div id="comment-data-pluck">\n<p style="width:400px;"></p>\n\n<script language="JavaScript">\ntry{document.domain="washingtonpost.com";}catch(e){};\n</script><script type="text/javascript" language="javascript" src="http://community.washingtonpost.com/ver1.0/SiteLifeProxy"></script><script>\n</script><script language="JavaScript">gSiteLife.CommentsInput("ExternalResource",article_id,comments_url);</script>\n</div>\n\n<div class="clearboth"></div>\n\n<p>Comments that include profanity or personal attacks or other inappropriate comments or material will be removed from the site. Additionally, entries that are unsigned or contain "signatures" by someone other than the actual author will be removed. Finally, we will take steps to block users who violate any of our posting standards, terms of use or privacy policies or any other policies governing this site. Please review the <a href="http://www.washingtonpost.com/wp-srv/liveonline/delphi/delphirules.htm">full rules</a> governing commentaries and discussions. You are fully responsible for the content that you post.</p>\n\n<div class="clearboth"></div>\n</div>\n</div>\n</div>\n<div align="center"> <script src="http://www.washingtonpost.com/wp-adv/adproducts/advertisingLinks/advertisingLinks_v2.js"></script>\n</div>\n<!-- start the copyright for the articles -->\n<div id="articleCopyright" style="clear:both;" align="center">\302\251\302\2402010\302\240The Washington Post Company</div>\n<!-- end the copyright for the aricles --><!-- start the copyright for the secions --><!-- end the copyright for the secions -->
metadata CHANGED
@@ -1,10 +1,11 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: extractula
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paul Dix
8
+ - Sander Hartlage
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
@@ -22,6 +23,16 @@ dependencies:
22
23
  - !ruby/object:Gem::Version
23
24
  version: 0.0.0
24
25
  version:
26
+ - !ruby/object:Gem::Dependency
27
+ name: loofah
28
+ type: :runtime
29
+ version_requirement:
30
+ version_requirements: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: 0.4.2
35
+ version:
25
36
  description:
26
37
  email: paul@pauldix.net
27
38
  executables: []
@@ -54,6 +65,7 @@ files:
54
65
  - spec/test-files/weather-channel-marriage-proposal-touching-with-a-chance-of-viral-status-video.html
55
66
  - spec/test-files/nytimes.html
56
67
  - spec/test-files/nytimes_story.html
68
+ - spec/test-files/script_tag_remove_case.html
57
69
  has_rdoc: true
58
70
  homepage: http://github.com/pauldix/extractula
59
71
  licenses: []