textract 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/textract/version.rb +1 -1
- data/lib/textract.rb +12 -8
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2b777157b81de9ecfc3d2aff6c9a88e8413c23fc
|
4
|
+
data.tar.gz: ed98b845d1819d2fbb0ab9ac8172ba95b308b227
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1ba20d4cb668c8da2ee0a53a9585433b98b0c576e67771084345b57393b5a471cc5191db60654fcdfbf335ffc28478f46397460e0c788fb18a53a703d1fda5b9
|
7
|
+
data.tar.gz: 15de095a7a95ffba2a8263b2c67a062a42814c4e7da32045b7be53d1dcc421ebdf5141581607bfbdf87de80fa60b8e8130b73d43bdd1d1cfe2b36451553310ad
|
data/lib/textract/version.rb
CHANGED
data/lib/textract.rb
CHANGED
@@ -16,7 +16,7 @@ module Textract
|
|
16
16
|
OpenGraph.new(html)
|
17
17
|
end
|
18
18
|
|
19
|
-
def self.
|
19
|
+
def self.smart_extract(html, description, selectors)
|
20
20
|
doc = Nokogiri::HTML html
|
21
21
|
if selectors.nil?
|
22
22
|
article = doc.search('article')
|
@@ -72,24 +72,28 @@ module Textract
|
|
72
72
|
attr_reader :md5
|
73
73
|
attr_reader :author
|
74
74
|
|
75
|
-
def initialize(url, selectors)
|
75
|
+
def initialize(url, selectors, format="markdown")
|
76
76
|
@url = url
|
77
77
|
agent = Mechanize.new
|
78
78
|
agent.user_agent_alias = 'Mac Safari'
|
79
79
|
@html = agent.get(url).content
|
80
80
|
@tags = Textract.get_og_tags(@html)
|
81
81
|
|
82
|
-
@article = Textract.
|
83
|
-
|
82
|
+
@article = Textract.smart_extract(@html, @tags.description, selectors)
|
83
|
+
if @article.content.nil?
|
84
|
+
@text = ""
|
85
|
+
else
|
86
|
+
if format == 'markdown'
|
87
|
+
@text = ReverseMarkdown.convert @article.content, unknown_tags: :bypass
|
88
|
+
else
|
89
|
+
@text = @article.content
|
90
|
+
end
|
91
|
+
end
|
84
92
|
@md5 = Textract.generate_hash @text
|
85
93
|
@author = @article.author || Textract.get_author(@html)
|
86
94
|
@title = @tags.title || Textract.get_page_title(@html)
|
87
95
|
end
|
88
96
|
|
89
|
-
def to_json
|
90
|
-
to_h.to_json
|
91
|
-
end
|
92
|
-
|
93
97
|
def to_h
|
94
98
|
{
|
95
99
|
url: @url,
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adam Pash
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-04-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: opengraph_parser
|