artext 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/Gemfile.lock +30 -33
- data/Rakefile +3 -1
- data/artext.gemspec +5 -5
- data/lib/artext.rb +13 -4
- data/lib/artext/version.rb +1 -1
- metadata +32 -31
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 2b5fad951364d5b4d72143b5306bc59138b8ecbc120f34acd826aa7a64694e7c
|
4
|
+
data.tar.gz: 71c65f3cb3d0a7451dab3703050cd788454bdfba65981a8e633dc5f62808b059
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ef346ac934aeeb5ad7960b343ab7fe07206ba97da57dd16c02f686028c3ff4678b21be3852dcde619188bbaf5881ef63d89540f7686e924c3bc9267e5b9ca96b
|
7
|
+
data.tar.gz: 99b81500ee6ba0aac5a43cf54a3991f07e5856232a6cac49567c242b8195eaae72c49d9d6a4e8f390ae3836f5667bb977adfa944b11d01f1af6be88f57ad936b
|
data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
*.DS_STORE
|
data/Gemfile.lock
CHANGED
@@ -1,45 +1,42 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
artext (0.0.
|
5
|
-
addressable (~> 2.
|
6
|
-
fastimage (~> 1
|
7
|
-
httparty (~> 0.
|
8
|
-
mini_magick (~>
|
9
|
-
nokogiri (~> 1.
|
4
|
+
artext (0.0.4)
|
5
|
+
addressable (~> 2.5)
|
6
|
+
fastimage (~> 2.1)
|
7
|
+
httparty (~> 0.14)
|
8
|
+
mini_magick (~> 4.8)
|
9
|
+
nokogiri (~> 1.8)
|
10
10
|
|
11
11
|
GEM
|
12
12
|
remote: https://rubygems.org/
|
13
13
|
specs:
|
14
|
-
addressable (2.
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
httparty (0.
|
19
|
-
json (~> 1.8)
|
14
|
+
addressable (2.5.2)
|
15
|
+
public_suffix (>= 2.0.2, < 4.0)
|
16
|
+
diff-lcs (1.3)
|
17
|
+
fastimage (2.1.4)
|
18
|
+
httparty (0.16.2)
|
20
19
|
multi_xml (>= 0.5.2)
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
rspec-
|
31
|
-
rspec-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
rspec-expectations (3.3.1)
|
20
|
+
mini_magick (4.9.2)
|
21
|
+
mini_portile2 (2.3.0)
|
22
|
+
multi_xml (0.6.0)
|
23
|
+
nokogiri (1.8.5)
|
24
|
+
mini_portile2 (~> 2.3.0)
|
25
|
+
public_suffix (3.0.3)
|
26
|
+
rake (10.5.0)
|
27
|
+
rspec (3.8.0)
|
28
|
+
rspec-core (~> 3.8.0)
|
29
|
+
rspec-expectations (~> 3.8.0)
|
30
|
+
rspec-mocks (~> 3.8.0)
|
31
|
+
rspec-core (3.8.0)
|
32
|
+
rspec-support (~> 3.8.0)
|
33
|
+
rspec-expectations (3.8.2)
|
36
34
|
diff-lcs (>= 1.2.0, < 2.0)
|
37
|
-
rspec-support (~> 3.
|
38
|
-
rspec-mocks (3.
|
35
|
+
rspec-support (~> 3.8.0)
|
36
|
+
rspec-mocks (3.8.0)
|
39
37
|
diff-lcs (>= 1.2.0, < 2.0)
|
40
|
-
rspec-support (~> 3.
|
41
|
-
rspec-support (3.
|
42
|
-
subexec (0.2.3)
|
38
|
+
rspec-support (~> 3.8.0)
|
39
|
+
rspec-support (3.8.0)
|
43
40
|
|
44
41
|
PLATFORMS
|
45
42
|
ruby
|
@@ -51,4 +48,4 @@ DEPENDENCIES
|
|
51
48
|
rspec (~> 3.3)
|
52
49
|
|
53
50
|
BUNDLED WITH
|
54
|
-
1.
|
51
|
+
1.16.3
|
data/Rakefile
CHANGED
data/artext.gemspec
CHANGED
@@ -23,9 +23,9 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.add_development_dependency "rspec", "~> 3.3"
|
24
24
|
|
25
25
|
|
26
|
-
spec.add_dependency "addressable", "~> 2.
|
27
|
-
spec.add_dependency "httparty", "~> 0.
|
28
|
-
spec.add_dependency "fastimage", "~> 1
|
29
|
-
spec.add_dependency "mini_magick", "~>
|
30
|
-
spec.add_dependency "nokogiri", "~> 1.
|
26
|
+
spec.add_dependency "addressable", "~> 2.5"
|
27
|
+
spec.add_dependency "httparty", "~> 0.14"
|
28
|
+
spec.add_dependency "fastimage", "~> 2.1"
|
29
|
+
spec.add_dependency "mini_magick", "~> 4.8"
|
30
|
+
spec.add_dependency "nokogiri", "~> 1.8"
|
31
31
|
end
|
data/lib/artext.rb
CHANGED
@@ -4,6 +4,7 @@ require "httparty"
|
|
4
4
|
require "nokogiri"
|
5
5
|
require "mini_magick"
|
6
6
|
require "fastimage"
|
7
|
+
require "addressable"
|
7
8
|
|
8
9
|
module Artext
|
9
10
|
|
@@ -121,7 +122,7 @@ module Artext
|
|
121
122
|
if (article.count > 1)
|
122
123
|
article = get_correct_article(article)
|
123
124
|
score = 0.8
|
124
|
-
end
|
125
|
+
end
|
125
126
|
if (is_blank?(article))
|
126
127
|
article = find_article(doc)
|
127
128
|
score = 0.6
|
@@ -173,7 +174,7 @@ module Artext
|
|
173
174
|
end
|
174
175
|
return rel, score
|
175
176
|
end
|
176
|
-
|
177
|
+
|
177
178
|
def self.iteratively_clean(element, html, images, score)
|
178
179
|
html = ""
|
179
180
|
imgs = []
|
@@ -227,15 +228,17 @@ module Artext
|
|
227
228
|
elsif (element.name == "p")
|
228
229
|
p_elem, ti = extractp(element, score)
|
229
230
|
tv = "<p>#{p_elem}</p>" if (!is_blank?(p_elem))
|
231
|
+
tv = tv.gsub("<p></p>", "")
|
230
232
|
images = images + ti if (!is_blank?(ti))
|
231
233
|
elsif (element.name == "figure")
|
232
|
-
cap = element.search("figcaption").
|
234
|
+
cap = element.search("figcaption").inner_html
|
233
235
|
cap = is_blank?(cap) ? "" : "<figcaption>#{cap}</figcaption>"
|
234
236
|
tv, ti = figurehandle(element, "", [])
|
235
237
|
tv = "<figure>#{tv}#{cap}</figure>" if (!is_blank?(tv))
|
236
238
|
images = images + ti
|
237
239
|
elsif (element.name == "text")
|
238
240
|
tv = element.text.split.join(" ")
|
241
|
+
tv = nil if tv == "advertisement"
|
239
242
|
tv = "<p class\"inline\">#{tv}</p>" if (!is_blank?(tv))
|
240
243
|
elsif (element.name == "i")
|
241
244
|
tv = element.text.split.join(" ")
|
@@ -292,7 +295,7 @@ module Artext
|
|
292
295
|
end
|
293
296
|
|
294
297
|
def self.phandle(element, html, images)
|
295
|
-
if (!is_blank?(element.children) && !(element.name == "a" && is_blank?(element.search("img"))))
|
298
|
+
if (!is_blank?(element.children) && !((element.name == "a" || element.name == "figure") && is_blank?(element.search("img"))))
|
296
299
|
element.children.each do |elem|
|
297
300
|
html, images = phandle(elem, html, images)
|
298
301
|
end
|
@@ -303,6 +306,12 @@ module Artext
|
|
303
306
|
html = "</p><figure><img src=\"#{img}\"></figure><p>"
|
304
307
|
images << img
|
305
308
|
end
|
309
|
+
elsif (element.name == "figure")
|
310
|
+
cap = element.search("figcaption").inner_html
|
311
|
+
cap = is_blank?(cap) ? "" : "<figcaption>#{cap}</figcaption>"
|
312
|
+
tv, ti = figurehandle(element, "", [])
|
313
|
+
html = "</p><figure>#{tv}#{cap}</figure><p>" if (!is_blank?(tv))
|
314
|
+
images << ti if (!is_blank?(ti))
|
306
315
|
elsif (element.name == "a")
|
307
316
|
html = html + " <a href=\"#{element.attribute("href").value if (!is_blank?(element.attribute("href")))}\">#{element.text.split.join(" ")}</a> "
|
308
317
|
elsif (element.name == "text")
|
data/lib/artext/version.rb
CHANGED
metadata
CHANGED
@@ -1,127 +1,127 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: artext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Anindya Mondal
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.6'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ~>
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.6'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - ~>
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '10.0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - ~>
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '10.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rspec
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - ~>
|
45
|
+
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: '3.3'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - ~>
|
52
|
+
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '3.3'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: addressable
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - ~>
|
59
|
+
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '2.
|
61
|
+
version: '2.5'
|
62
62
|
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - ~>
|
66
|
+
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '2.
|
68
|
+
version: '2.5'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: httparty
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- - ~>
|
73
|
+
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: '0.
|
75
|
+
version: '0.14'
|
76
76
|
type: :runtime
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- - ~>
|
80
|
+
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: '0.
|
82
|
+
version: '0.14'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: fastimage
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- - ~>
|
87
|
+
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: '1
|
89
|
+
version: '2.1'
|
90
90
|
type: :runtime
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
|
-
- - ~>
|
94
|
+
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: '1
|
96
|
+
version: '2.1'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
98
|
name: mini_magick
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
|
-
- - ~>
|
101
|
+
- - "~>"
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: '
|
103
|
+
version: '4.8'
|
104
104
|
type: :runtime
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
|
-
- - ~>
|
108
|
+
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version: '
|
110
|
+
version: '4.8'
|
111
111
|
- !ruby/object:Gem::Dependency
|
112
112
|
name: nokogiri
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
114
114
|
requirements:
|
115
|
-
- - ~>
|
115
|
+
- - "~>"
|
116
116
|
- !ruby/object:Gem::Version
|
117
|
-
version: '1.
|
117
|
+
version: '1.8'
|
118
118
|
type: :runtime
|
119
119
|
prerelease: false
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
121
|
requirements:
|
122
|
-
- - ~>
|
122
|
+
- - "~>"
|
123
123
|
- !ruby/object:Gem::Version
|
124
|
-
version: '1.
|
124
|
+
version: '1.8'
|
125
125
|
description: Extract article and other metadata from websites.
|
126
126
|
email:
|
127
127
|
- anindyamondal@mazdigital.com
|
@@ -129,6 +129,7 @@ executables: []
|
|
129
129
|
extensions: []
|
130
130
|
extra_rdoc_files: []
|
131
131
|
files:
|
132
|
+
- ".gitignore"
|
132
133
|
- Gemfile
|
133
134
|
- Gemfile.lock
|
134
135
|
- LICENSE.txt
|
@@ -147,17 +148,17 @@ require_paths:
|
|
147
148
|
- lib
|
148
149
|
required_ruby_version: !ruby/object:Gem::Requirement
|
149
150
|
requirements:
|
150
|
-
- -
|
151
|
+
- - ">="
|
151
152
|
- !ruby/object:Gem::Version
|
152
153
|
version: '0'
|
153
154
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
154
155
|
requirements:
|
155
|
-
- -
|
156
|
+
- - ">="
|
156
157
|
- !ruby/object:Gem::Version
|
157
158
|
version: '0'
|
158
159
|
requirements: []
|
159
160
|
rubyforge_project:
|
160
|
-
rubygems_version: 2.
|
161
|
+
rubygems_version: 2.7.7
|
161
162
|
signing_key:
|
162
163
|
specification_version: 4
|
163
164
|
summary: Extract article from websites.
|