juxtaparssionate 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/README.rdoc +5 -0
- data/lib/parser.rb +43 -0
- data/lib/scraper.rb +23 -0
- data/lib/version.rb +1 -0
- data/test/files/job_posting.txt +1 -0
- data/test/test_parser.rb +32 -0
- metadata +122 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
ZTY1Y2U2NGMzZjBmNGFhM2I2ZWE5M2VmMmZjOGJiNzI3YTY1M2Q0ZQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
OTg1MTFjNGI2NDEyZDM2ZTU3OWFjNGY2YzhjN2Q3YTE1ZjM4ZjI1OQ==
|
7
|
+
SHA512:
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
ZTNkODg3MGYyZGM0OGM3OTI3MzVkNGZmZTk5MTA5OTQzMjYxYmZjMmE5NzFm
|
10
|
+
OTEyMGRkOGViMTgwN2Q0YWIxMDhiNmM5ZjUzYmI2NjM1OTFjNzg5MTAwYjQ0
|
11
|
+
M2U1ZmFlN2NlZTU4MzAyMmJiNzZmZGQzNzc3ZWEyYzhkYmQ1Y2U=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
ZGQ1YjdlMGE5MmUyNTA5MzVlOTcwZGFiMzVmMzRkMGZiNjFkZGY3NzY3YTNh
|
14
|
+
OGI1YjVlYjk0MmMzMDFkMjY4YzZmMjAwMDRjMTkzYWUxNjU5ZWZjN2E2YzA1
|
15
|
+
YzM0Y2UxNjlmMDg3NzRkMDhhNjRkM2VjMjc3ZTA4NThhY2E5NjU=
|
data/README.rdoc
ADDED
data/lib/parser.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'sanitize'
|
2
|
+
require 'treat'
|
3
|
+
|
4
|
+
include Treat::Core::DSL
|
5
|
+
|
6
|
+
# Loads a job-posting file and derives simple word statistics from it.
#
# Depends on the file-level `require 'sanitize'`, `require 'treat'` and
# `include Treat::Core::DSL`; the latter supplies the `document` builder
# used by #initialize.
class Parser
  attr_accessor :contents

  # Builds the document for +file_path+ with the Treat DSL and stores it in
  # +contents+. The interpolation coerces +file_path+ to a String.
  def initialize(file_path)
    @contents = document "#{file_path}"
  end

  # Passes +contents+ through Sanitize.fragment, then collapses every run of
  # whitespace into a single space (leading/trailing whitespace is dropped).
  #
  # Returns the cleaned text as a String.
  def strip_tags
    Sanitize.fragment(@contents).split.join(' ')
  end

  # Applies the :chunk, :segment, :tokenize and :category steps to +contents+
  # and returns the string form of every entry reported by `nouns`.
  def extract_nouns
    tagged = @contents.apply(:chunk, :segment, :tokenize, :category)
    tagged.nouns.map(&:to_s)
  end

  # Applies the :chunk, :segment and :tokenize steps to the tag-stripped text
  # and returns the string form of each resulting element.
  def extract_words
    tokens = strip_tags.apply(:chunk, :segment, :tokenize)
    tokens.map(&:to_s)
  end

  # Counts how many times each element occurs in +array+.
  #
  # The tally is rebuilt from scratch on every call and is also kept in
  # @word_count. Returns a Hash of element => occurrence count ({} for an
  # empty +array+).
  def count_words(array)
    @word_count = {}
    # fetch with a 0 fallback replaces the explicit nil comparison.
    array.each { |word| @word_count[word] = @word_count.fetch(word, 0) + 1 }
    @word_count
  end

  # Sorts +hash+ by value in ascending order.
  #
  # Returns an Array of [key, value] pairs, not a Hash.
  def order_hash(hash)
    hash.sort_by { |_key, value| value }
  end
end
|
data/lib/scraper.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
# Fetches job postings from We Work Remotely and writes each one to a text
# file under postings/ (relative to the current working directory).
#
# Depends on the file-level `require 'mechanize'` and `require 'nokogiri'`.
class Scraper
  # Visits the category listing, follows every link whose href contains
  # "/jobs/", and saves the <title> markup followed by the
  # div.listing-container markup of each linked page.
  #
  # The file index is the link's position among the matched links, so a
  # skipped link leaves a gap in the numbering.
  def we_work
    agent = Mechanize.new
    listing = agent.get('https://weworkremotely.com/categories/2/jobs')
    listing.links_with(href: %r{/jobs/}).each_with_index do |link, index|
      # Skip the "post a new job" link whether the page emits it as an
      # absolute URL or as a relative path.
      next if link.href.end_with?('/jobs/new')

      # Keep the clicked page in its own local instead of overwriting the
      # listing page variable.
      doc = link.click.parser
      extracted_words = doc.css('title').to_s
      extracted_words << doc.css('div.listing-container').to_s
      save extracted_words, index
    end
  end

  # Writes +doc+ to postings/job_posting<index>.txt, creating the postings
  # directory first when it is missing.
  #
  # Returns the value of File#write (the number of bytes written).
  def save(doc, index)
    Dir.mkdir('postings') unless Dir.exist?('postings')
    File.open("postings/job_posting#{index}.txt", 'w') { |f| f.write doc }
  end
end
|
data/lib/version.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
# Gem version string; frozen so the constant cannot be mutated in place.
VERSION = '0.0.1'.freeze
|
data/test/files/job_posting.txt
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
<div><ul><li><p> ninja <em>10x</em> wanted for ninja douchenozzle team</p></li></ul></div>
|
data/test/test_parser.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'minitest/autorun'
|
2
|
+
require_relative '../lib/parser'
|
3
|
+
|
4
|
+
# Unit tests for Parser, driven by the fixture file files/job_posting.txt
# that sits next to this test file.
class TestParser < Minitest::Test
  # Absolute fixture path, resolved from this file's directory so the suite
  # does not depend on the directory it is launched from.
  FIXTURE_PATH = File.expand_path('files/job_posting.txt', __dir__)

  # Builds a fresh parser and the reference word-count hash for every test.
  def setup
    @parser = Parser.new FIXTURE_PATH
    @hash = { '10x' => 1, 'ninja' => 3, 'douchenozzle' => 2 }
  end

  def test_that_HTML_tags_are_stripped
    assert_equal 'ninja 10x wanted for ninja douchenozzle team', @parser.strip_tags
  end

  def test_that_words_are_extracted
    assert_equal ["ninja", "10x", "wanted", "for", "ninja", "douchenozzle", "team"], @parser.extract_words
  end

  def test_that_words_are_counted
    array = ['10x', 'ninja', 'douchenozzle', 'ninja', 'douchenozzle', 'ninja']
    assert_equal @hash, @parser.count_words(array)
  end

  # order_hash is expected to return [key, value] pairs in ascending value order.
  def test_that_the_hash_is_ordered
    array = [['10x', 1], ['douchenozzle', 2], ['ninja', 3]]
    assert_equal array, @parser.order_hash(@hash)
  end

  def test_that_nouns_are_extracted
    assert_equal ["team"], @parser.extract_nouns
  end
end
|
32
|
+
|
metadata
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: juxtaparssionate
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Thomas Osborn
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-04-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: treat
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ! '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ! '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rwordnet
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ! '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: nokogiri
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ! '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: mechanize
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ! '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: sanitize
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ! '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ! '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
description: juxtaparssionate parses and compares job postings from various popular
|
84
|
+
job boards. By comparing word frequency, sentence length, and parts-of-speech counts,
|
85
|
+
juxtaparssionate produces metrics loosely determining the quality and uniqueness
|
86
|
+
of a job posting
|
87
|
+
email: trosborn@gmail.com
|
88
|
+
executables: []
|
89
|
+
extensions: []
|
90
|
+
extra_rdoc_files: []
|
91
|
+
files:
|
92
|
+
- README.rdoc
|
93
|
+
- lib/parser.rb
|
94
|
+
- lib/scraper.rb
|
95
|
+
- lib/version.rb
|
96
|
+
- test/files/job_posting.txt
|
97
|
+
- test/test_parser.rb
|
98
|
+
homepage: http://www.github.com/trosborn/juxtaparssionate
|
99
|
+
licenses:
|
100
|
+
- MIT
|
101
|
+
metadata: {}
|
102
|
+
post_install_message:
|
103
|
+
rdoc_options: []
|
104
|
+
require_paths:
|
105
|
+
- lib
|
106
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ! '>='
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
112
|
+
requirements:
|
113
|
+
- - ! '>='
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
version: '0'
|
116
|
+
requirements: []
|
117
|
+
rubyforge_project:
|
118
|
+
rubygems_version: 2.4.5
|
119
|
+
signing_key:
|
120
|
+
specification_version: 4
|
121
|
+
summary: A tool for parsing and comparing job postings
|
122
|
+
test_files: []
|